{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "\n",
    "from scipy.stats import skew\n",
    "\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "from IPython.display import display\n",
    "#导入需要使用到的工具包"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Id</th>\n",
       "      <th>MSSubClass</th>\n",
       "      <th>MSZoning</th>\n",
       "      <th>LotFrontage</th>\n",
       "      <th>LotArea</th>\n",
       "      <th>Street</th>\n",
       "      <th>Alley</th>\n",
       "      <th>LotShape</th>\n",
       "      <th>LandContour</th>\n",
       "      <th>Utilities</th>\n",
       "      <th>...</th>\n",
       "      <th>PoolArea</th>\n",
       "      <th>PoolQC</th>\n",
       "      <th>Fence</th>\n",
       "      <th>MiscFeature</th>\n",
       "      <th>MiscVal</th>\n",
       "      <th>MoSold</th>\n",
       "      <th>YrSold</th>\n",
       "      <th>SaleType</th>\n",
       "      <th>SaleCondition</th>\n",
       "      <th>SalePrice</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>60</td>\n",
       "      <td>RL</td>\n",
       "      <td>65.0</td>\n",
       "      <td>8450</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Reg</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>2008</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "      <td>208500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>20</td>\n",
       "      <td>RL</td>\n",
       "      <td>80.0</td>\n",
       "      <td>9600</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Reg</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>2007</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "      <td>181500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>60</td>\n",
       "      <td>RL</td>\n",
       "      <td>68.0</td>\n",
       "      <td>11250</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IR1</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "      <td>2008</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "      <td>223500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>70</td>\n",
       "      <td>RL</td>\n",
       "      <td>60.0</td>\n",
       "      <td>9550</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IR1</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>2006</td>\n",
       "      <td>WD</td>\n",
       "      <td>Abnorml</td>\n",
       "      <td>140000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>60</td>\n",
       "      <td>RL</td>\n",
       "      <td>84.0</td>\n",
       "      <td>14260</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IR1</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "      <td>2008</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "      <td>250000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 81 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \\\n",
       "0   1          60       RL         65.0     8450   Pave   NaN      Reg   \n",
       "1   2          20       RL         80.0     9600   Pave   NaN      Reg   \n",
       "2   3          60       RL         68.0    11250   Pave   NaN      IR1   \n",
       "3   4          70       RL         60.0     9550   Pave   NaN      IR1   \n",
       "4   5          60       RL         84.0    14260   Pave   NaN      IR1   \n",
       "\n",
       "  LandContour Utilities    ...     PoolArea PoolQC Fence MiscFeature MiscVal  \\\n",
       "0         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   \n",
       "1         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   \n",
       "2         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   \n",
       "3         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   \n",
       "4         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   \n",
       "\n",
       "  MoSold YrSold  SaleType  SaleCondition  SalePrice  \n",
       "0      2   2008        WD         Normal     208500  \n",
       "1      5   2007        WD         Normal     181500  \n",
       "2      9   2008        WD         Normal     223500  \n",
       "3      2   2006        WD        Abnorml     140000  \n",
       "4     12   2008        WD         Normal     250000  \n",
       "\n",
       "[5 rows x 81 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train = pd.read_csv(\"Ames_House_train.csv\")\n",
    "train.head()\n",
    "#把训练数据导入进来，然后查看他的前五行的数据，进行观察。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 1460 entries, 0 to 1459\n",
      "Data columns (total 81 columns):\n",
      "Id               1460 non-null int64\n",
      "MSSubClass       1460 non-null int64\n",
      "MSZoning         1460 non-null object\n",
      "LotFrontage      1201 non-null float64\n",
      "LotArea          1460 non-null int64\n",
      "Street           1460 non-null object\n",
      "Alley            91 non-null object\n",
      "LotShape         1460 non-null object\n",
      "LandContour      1460 non-null object\n",
      "Utilities        1460 non-null object\n",
      "LotConfig        1460 non-null object\n",
      "LandSlope        1460 non-null object\n",
      "Neighborhood     1460 non-null object\n",
      "Condition1       1460 non-null object\n",
      "Condition2       1460 non-null object\n",
      "BldgType         1460 non-null object\n",
      "HouseStyle       1460 non-null object\n",
      "OverallQual      1460 non-null int64\n",
      "OverallCond      1460 non-null int64\n",
      "YearBuilt        1460 non-null int64\n",
      "YearRemodAdd     1460 non-null int64\n",
      "RoofStyle        1460 non-null object\n",
      "RoofMatl         1460 non-null object\n",
      "Exterior1st      1460 non-null object\n",
      "Exterior2nd      1460 non-null object\n",
      "MasVnrType       1452 non-null object\n",
      "MasVnrArea       1452 non-null float64\n",
      "ExterQual        1460 non-null object\n",
      "ExterCond        1460 non-null object\n",
      "Foundation       1460 non-null object\n",
      "BsmtQual         1423 non-null object\n",
      "BsmtCond         1423 non-null object\n",
      "BsmtExposure     1422 non-null object\n",
      "BsmtFinType1     1423 non-null object\n",
      "BsmtFinSF1       1460 non-null int64\n",
      "BsmtFinType2     1422 non-null object\n",
      "BsmtFinSF2       1460 non-null int64\n",
      "BsmtUnfSF        1460 non-null int64\n",
      "TotalBsmtSF      1460 non-null int64\n",
      "Heating          1460 non-null object\n",
      "HeatingQC        1460 non-null object\n",
      "CentralAir       1460 non-null object\n",
      "Electrical       1459 non-null object\n",
      "1stFlrSF         1460 non-null int64\n",
      "2ndFlrSF         1460 non-null int64\n",
      "LowQualFinSF     1460 non-null int64\n",
      "GrLivArea        1460 non-null int64\n",
      "BsmtFullBath     1460 non-null int64\n",
      "BsmtHalfBath     1460 non-null int64\n",
      "FullBath         1460 non-null int64\n",
      "HalfBath         1460 non-null int64\n",
      "BedroomAbvGr     1460 non-null int64\n",
      "KitchenAbvGr     1460 non-null int64\n",
      "KitchenQual      1460 non-null object\n",
      "TotRmsAbvGrd     1460 non-null int64\n",
      "Functional       1460 non-null object\n",
      "Fireplaces       1460 non-null int64\n",
      "FireplaceQu      770 non-null object\n",
      "GarageType       1379 non-null object\n",
      "GarageYrBlt      1379 non-null float64\n",
      "GarageFinish     1379 non-null object\n",
      "GarageCars       1460 non-null int64\n",
      "GarageArea       1460 non-null int64\n",
      "GarageQual       1379 non-null object\n",
      "GarageCond       1379 non-null object\n",
      "PavedDrive       1460 non-null object\n",
      "WoodDeckSF       1460 non-null int64\n",
      "OpenPorchSF      1460 non-null int64\n",
      "EnclosedPorch    1460 non-null int64\n",
      "3SsnPorch        1460 non-null int64\n",
      "ScreenPorch      1460 non-null int64\n",
      "PoolArea         1460 non-null int64\n",
      "PoolQC           7 non-null object\n",
      "Fence            281 non-null object\n",
      "MiscFeature      54 non-null object\n",
      "MiscVal          1460 non-null int64\n",
      "MoSold           1460 non-null int64\n",
      "YrSold           1460 non-null int64\n",
      "SaleType         1460 non-null object\n",
      "SaleCondition    1460 non-null object\n",
      "SalePrice        1460 non-null int64\n",
      "dtypes: float64(3), int64(35), object(43)\n",
      "memory usage: 924.0+ KB\n"
     ]
    }
   ],
   "source": [
    "train.info()\n",
    "#查看训练数据的基本信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Id</th>\n",
       "      <th>MSSubClass</th>\n",
       "      <th>MSZoning</th>\n",
       "      <th>LotFrontage</th>\n",
       "      <th>LotArea</th>\n",
       "      <th>Street</th>\n",
       "      <th>Alley</th>\n",
       "      <th>LotShape</th>\n",
       "      <th>LandContour</th>\n",
       "      <th>Utilities</th>\n",
       "      <th>...</th>\n",
       "      <th>ScreenPorch</th>\n",
       "      <th>PoolArea</th>\n",
       "      <th>PoolQC</th>\n",
       "      <th>Fence</th>\n",
       "      <th>MiscFeature</th>\n",
       "      <th>MiscVal</th>\n",
       "      <th>MoSold</th>\n",
       "      <th>YrSold</th>\n",
       "      <th>SaleType</th>\n",
       "      <th>SaleCondition</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1461</td>\n",
       "      <td>20</td>\n",
       "      <td>RH</td>\n",
       "      <td>80.0</td>\n",
       "      <td>11622</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Reg</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>...</td>\n",
       "      <td>120</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>MnPrv</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "      <td>2010</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1462</td>\n",
       "      <td>20</td>\n",
       "      <td>RL</td>\n",
       "      <td>81.0</td>\n",
       "      <td>14267</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IR1</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Gar2</td>\n",
       "      <td>12500</td>\n",
       "      <td>6</td>\n",
       "      <td>2010</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1463</td>\n",
       "      <td>60</td>\n",
       "      <td>RL</td>\n",
       "      <td>74.0</td>\n",
       "      <td>13830</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IR1</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>MnPrv</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>2010</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1464</td>\n",
       "      <td>60</td>\n",
       "      <td>RL</td>\n",
       "      <td>78.0</td>\n",
       "      <td>9978</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IR1</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "      <td>2010</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1465</td>\n",
       "      <td>120</td>\n",
       "      <td>RL</td>\n",
       "      <td>43.0</td>\n",
       "      <td>5005</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IR1</td>\n",
       "      <td>HLS</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>...</td>\n",
       "      <td>144</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2010</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 80 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \\\n",
       "0  1461          20       RH         80.0    11622   Pave   NaN      Reg   \n",
       "1  1462          20       RL         81.0    14267   Pave   NaN      IR1   \n",
       "2  1463          60       RL         74.0    13830   Pave   NaN      IR1   \n",
       "3  1464          60       RL         78.0     9978   Pave   NaN      IR1   \n",
       "4  1465         120       RL         43.0     5005   Pave   NaN      IR1   \n",
       "\n",
       "  LandContour Utilities      ...       ScreenPorch PoolArea PoolQC  Fence  \\\n",
       "0         Lvl    AllPub      ...               120        0    NaN  MnPrv   \n",
       "1         Lvl    AllPub      ...                 0        0    NaN    NaN   \n",
       "2         Lvl    AllPub      ...                 0        0    NaN  MnPrv   \n",
       "3         Lvl    AllPub      ...                 0        0    NaN    NaN   \n",
       "4         HLS    AllPub      ...               144        0    NaN    NaN   \n",
       "\n",
       "  MiscFeature MiscVal MoSold  YrSold  SaleType  SaleCondition  \n",
       "0         NaN       0      6    2010        WD         Normal  \n",
       "1        Gar2   12500      6    2010        WD         Normal  \n",
       "2         NaN       0      3    2010        WD         Normal  \n",
       "3         NaN       0      6    2010        WD         Normal  \n",
       "4         NaN       0      1    2010        WD         Normal  \n",
       "\n",
       "[5 rows x 80 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test = pd.read_csv(\"Ames_House_test.csv\")\n",
    "test.head()\n",
    "#把测试数据导入进来，然后查看他的前五行的数据，进行观察。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 1460 entries, 0 to 1459\n",
      "Data columns (total 81 columns):\n",
      "Id               1460 non-null int64\n",
      "MSSubClass       1460 non-null int64\n",
      "MSZoning         1460 non-null object\n",
      "LotFrontage      1201 non-null float64\n",
      "LotArea          1460 non-null int64\n",
      "Street           1460 non-null object\n",
      "Alley            91 non-null object\n",
      "LotShape         1460 non-null object\n",
      "LandContour      1460 non-null object\n",
      "Utilities        1460 non-null object\n",
      "LotConfig        1460 non-null object\n",
      "LandSlope        1460 non-null object\n",
      "Neighborhood     1460 non-null object\n",
      "Condition1       1460 non-null object\n",
      "Condition2       1460 non-null object\n",
      "BldgType         1460 non-null object\n",
      "HouseStyle       1460 non-null object\n",
      "OverallQual      1460 non-null int64\n",
      "OverallCond      1460 non-null int64\n",
      "YearBuilt        1460 non-null int64\n",
      "YearRemodAdd     1460 non-null int64\n",
      "RoofStyle        1460 non-null object\n",
      "RoofMatl         1460 non-null object\n",
      "Exterior1st      1460 non-null object\n",
      "Exterior2nd      1460 non-null object\n",
      "MasVnrType       1452 non-null object\n",
      "MasVnrArea       1452 non-null float64\n",
      "ExterQual        1460 non-null object\n",
      "ExterCond        1460 non-null object\n",
      "Foundation       1460 non-null object\n",
      "BsmtQual         1423 non-null object\n",
      "BsmtCond         1423 non-null object\n",
      "BsmtExposure     1422 non-null object\n",
      "BsmtFinType1     1423 non-null object\n",
      "BsmtFinSF1       1460 non-null int64\n",
      "BsmtFinType2     1422 non-null object\n",
      "BsmtFinSF2       1460 non-null int64\n",
      "BsmtUnfSF        1460 non-null int64\n",
      "TotalBsmtSF      1460 non-null int64\n",
      "Heating          1460 non-null object\n",
      "HeatingQC        1460 non-null object\n",
      "CentralAir       1460 non-null object\n",
      "Electrical       1459 non-null object\n",
      "1stFlrSF         1460 non-null int64\n",
      "2ndFlrSF         1460 non-null int64\n",
      "LowQualFinSF     1460 non-null int64\n",
      "GrLivArea        1460 non-null int64\n",
      "BsmtFullBath     1460 non-null int64\n",
      "BsmtHalfBath     1460 non-null int64\n",
      "FullBath         1460 non-null int64\n",
      "HalfBath         1460 non-null int64\n",
      "BedroomAbvGr     1460 non-null int64\n",
      "KitchenAbvGr     1460 non-null int64\n",
      "KitchenQual      1460 non-null object\n",
      "TotRmsAbvGrd     1460 non-null int64\n",
      "Functional       1460 non-null object\n",
      "Fireplaces       1460 non-null int64\n",
      "FireplaceQu      770 non-null object\n",
      "GarageType       1379 non-null object\n",
      "GarageYrBlt      1379 non-null float64\n",
      "GarageFinish     1379 non-null object\n",
      "GarageCars       1460 non-null int64\n",
      "GarageArea       1460 non-null int64\n",
      "GarageQual       1379 non-null object\n",
      "GarageCond       1379 non-null object\n",
      "PavedDrive       1460 non-null object\n",
      "WoodDeckSF       1460 non-null int64\n",
      "OpenPorchSF      1460 non-null int64\n",
      "EnclosedPorch    1460 non-null int64\n",
      "3SsnPorch        1460 non-null int64\n",
      "ScreenPorch      1460 non-null int64\n",
      "PoolArea         1460 non-null int64\n",
      "PoolQC           7 non-null object\n",
      "Fence            281 non-null object\n",
      "MiscFeature      54 non-null object\n",
      "MiscVal          1460 non-null int64\n",
      "MoSold           1460 non-null int64\n",
      "YrSold           1460 non-null int64\n",
      "SaleType         1460 non-null object\n",
      "SaleCondition    1460 non-null object\n",
      "SalePrice        1460 non-null int64\n",
      "dtypes: float64(3), int64(35), object(43)\n",
      "memory usage: 924.0+ KB\n"
     ]
    }
   ],
   "source": [
    "train.info()\n",
    "#查看测试数据的基本信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Id</th>\n",
       "      <th>MSSubClass</th>\n",
       "      <th>LotFrontage</th>\n",
       "      <th>LotArea</th>\n",
       "      <th>OverallQual</th>\n",
       "      <th>OverallCond</th>\n",
       "      <th>YearBuilt</th>\n",
       "      <th>YearRemodAdd</th>\n",
       "      <th>MasVnrArea</th>\n",
       "      <th>BsmtFinSF1</th>\n",
       "      <th>...</th>\n",
       "      <th>WoodDeckSF</th>\n",
       "      <th>OpenPorchSF</th>\n",
       "      <th>EnclosedPorch</th>\n",
       "      <th>3SsnPorch</th>\n",
       "      <th>ScreenPorch</th>\n",
       "      <th>PoolArea</th>\n",
       "      <th>MiscVal</th>\n",
       "      <th>MoSold</th>\n",
       "      <th>YrSold</th>\n",
       "      <th>SalePrice</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>1460.000000</td>\n",
       "      <td>1460.000000</td>\n",
       "      <td>1201.000000</td>\n",
       "      <td>1460.000000</td>\n",
       "      <td>1460.000000</td>\n",
       "      <td>1460.000000</td>\n",
       "      <td>1460.000000</td>\n",
       "      <td>1460.000000</td>\n",
       "      <td>1452.000000</td>\n",
       "      <td>1460.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>1460.000000</td>\n",
       "      <td>1460.000000</td>\n",
       "      <td>1460.000000</td>\n",
       "      <td>1460.000000</td>\n",
       "      <td>1460.000000</td>\n",
       "      <td>1460.000000</td>\n",
       "      <td>1460.000000</td>\n",
       "      <td>1460.000000</td>\n",
       "      <td>1460.000000</td>\n",
       "      <td>1460.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>730.500000</td>\n",
       "      <td>56.897260</td>\n",
       "      <td>70.049958</td>\n",
       "      <td>10516.828082</td>\n",
       "      <td>6.099315</td>\n",
       "      <td>5.575342</td>\n",
       "      <td>1971.267808</td>\n",
       "      <td>1984.865753</td>\n",
       "      <td>103.685262</td>\n",
       "      <td>443.639726</td>\n",
       "      <td>...</td>\n",
       "      <td>94.244521</td>\n",
       "      <td>46.660274</td>\n",
       "      <td>21.954110</td>\n",
       "      <td>3.409589</td>\n",
       "      <td>15.060959</td>\n",
       "      <td>2.758904</td>\n",
       "      <td>43.489041</td>\n",
       "      <td>6.321918</td>\n",
       "      <td>2007.815753</td>\n",
       "      <td>180921.195890</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>421.610009</td>\n",
       "      <td>42.300571</td>\n",
       "      <td>24.284752</td>\n",
       "      <td>9981.264932</td>\n",
       "      <td>1.382997</td>\n",
       "      <td>1.112799</td>\n",
       "      <td>30.202904</td>\n",
       "      <td>20.645407</td>\n",
       "      <td>181.066207</td>\n",
       "      <td>456.098091</td>\n",
       "      <td>...</td>\n",
       "      <td>125.338794</td>\n",
       "      <td>66.256028</td>\n",
       "      <td>61.119149</td>\n",
       "      <td>29.317331</td>\n",
       "      <td>55.757415</td>\n",
       "      <td>40.177307</td>\n",
       "      <td>496.123024</td>\n",
       "      <td>2.703626</td>\n",
       "      <td>1.328095</td>\n",
       "      <td>79442.502883</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>20.000000</td>\n",
       "      <td>21.000000</td>\n",
       "      <td>1300.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1872.000000</td>\n",
       "      <td>1950.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>2006.000000</td>\n",
       "      <td>34900.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>365.750000</td>\n",
       "      <td>20.000000</td>\n",
       "      <td>59.000000</td>\n",
       "      <td>7553.500000</td>\n",
       "      <td>5.000000</td>\n",
       "      <td>5.000000</td>\n",
       "      <td>1954.000000</td>\n",
       "      <td>1967.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>5.000000</td>\n",
       "      <td>2007.000000</td>\n",
       "      <td>129975.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>730.500000</td>\n",
       "      <td>50.000000</td>\n",
       "      <td>69.000000</td>\n",
       "      <td>9478.500000</td>\n",
       "      <td>6.000000</td>\n",
       "      <td>5.000000</td>\n",
       "      <td>1973.000000</td>\n",
       "      <td>1994.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>383.500000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>25.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>6.000000</td>\n",
       "      <td>2008.000000</td>\n",
       "      <td>163000.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>1095.250000</td>\n",
       "      <td>70.000000</td>\n",
       "      <td>80.000000</td>\n",
       "      <td>11601.500000</td>\n",
       "      <td>7.000000</td>\n",
       "      <td>6.000000</td>\n",
       "      <td>2000.000000</td>\n",
       "      <td>2004.000000</td>\n",
       "      <td>166.000000</td>\n",
       "      <td>712.250000</td>\n",
       "      <td>...</td>\n",
       "      <td>168.000000</td>\n",
       "      <td>68.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>8.000000</td>\n",
       "      <td>2009.000000</td>\n",
       "      <td>214000.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>1460.000000</td>\n",
       "      <td>190.000000</td>\n",
       "      <td>313.000000</td>\n",
       "      <td>215245.000000</td>\n",
       "      <td>10.000000</td>\n",
       "      <td>9.000000</td>\n",
       "      <td>2010.000000</td>\n",
       "      <td>2010.000000</td>\n",
       "      <td>1600.000000</td>\n",
       "      <td>5644.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>857.000000</td>\n",
       "      <td>547.000000</td>\n",
       "      <td>552.000000</td>\n",
       "      <td>508.000000</td>\n",
       "      <td>480.000000</td>\n",
       "      <td>738.000000</td>\n",
       "      <td>15500.000000</td>\n",
       "      <td>12.000000</td>\n",
       "      <td>2010.000000</td>\n",
       "      <td>755000.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>8 rows × 38 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                Id   MSSubClass  LotFrontage        LotArea  OverallQual  \\\n",
       "count  1460.000000  1460.000000  1201.000000    1460.000000  1460.000000   \n",
       "mean    730.500000    56.897260    70.049958   10516.828082     6.099315   \n",
       "std     421.610009    42.300571    24.284752    9981.264932     1.382997   \n",
       "min       1.000000    20.000000    21.000000    1300.000000     1.000000   \n",
       "25%     365.750000    20.000000    59.000000    7553.500000     5.000000   \n",
       "50%     730.500000    50.000000    69.000000    9478.500000     6.000000   \n",
       "75%    1095.250000    70.000000    80.000000   11601.500000     7.000000   \n",
       "max    1460.000000   190.000000   313.000000  215245.000000    10.000000   \n",
       "\n",
       "       OverallCond    YearBuilt  YearRemodAdd   MasVnrArea   BsmtFinSF1  \\\n",
       "count  1460.000000  1460.000000   1460.000000  1452.000000  1460.000000   \n",
       "mean      5.575342  1971.267808   1984.865753   103.685262   443.639726   \n",
       "std       1.112799    30.202904     20.645407   181.066207   456.098091   \n",
       "min       1.000000  1872.000000   1950.000000     0.000000     0.000000   \n",
       "25%       5.000000  1954.000000   1967.000000     0.000000     0.000000   \n",
       "50%       5.000000  1973.000000   1994.000000     0.000000   383.500000   \n",
       "75%       6.000000  2000.000000   2004.000000   166.000000   712.250000   \n",
       "max       9.000000  2010.000000   2010.000000  1600.000000  5644.000000   \n",
       "\n",
       "           ...         WoodDeckSF  OpenPorchSF  EnclosedPorch    3SsnPorch  \\\n",
       "count      ...        1460.000000  1460.000000    1460.000000  1460.000000   \n",
       "mean       ...          94.244521    46.660274      21.954110     3.409589   \n",
       "std        ...         125.338794    66.256028      61.119149    29.317331   \n",
       "min        ...           0.000000     0.000000       0.000000     0.000000   \n",
       "25%        ...           0.000000     0.000000       0.000000     0.000000   \n",
       "50%        ...           0.000000    25.000000       0.000000     0.000000   \n",
       "75%        ...         168.000000    68.000000       0.000000     0.000000   \n",
       "max        ...         857.000000   547.000000     552.000000   508.000000   \n",
       "\n",
       "       ScreenPorch     PoolArea       MiscVal       MoSold       YrSold  \\\n",
       "count  1460.000000  1460.000000   1460.000000  1460.000000  1460.000000   \n",
       "mean     15.060959     2.758904     43.489041     6.321918  2007.815753   \n",
       "std      55.757415    40.177307    496.123024     2.703626     1.328095   \n",
       "min       0.000000     0.000000      0.000000     1.000000  2006.000000   \n",
       "25%       0.000000     0.000000      0.000000     5.000000  2007.000000   \n",
       "50%       0.000000     0.000000      0.000000     6.000000  2008.000000   \n",
       "75%       0.000000     0.000000      0.000000     8.000000  2009.000000   \n",
       "max     480.000000   738.000000  15500.000000    12.000000  2010.000000   \n",
       "\n",
       "           SalePrice  \n",
       "count    1460.000000  \n",
       "mean   180921.195890  \n",
       "std     79442.502883  \n",
       "min     34900.000000  \n",
       "25%    129975.000000  \n",
       "50%    163000.000000  \n",
       "75%    214000.000000  \n",
       "max    755000.000000  \n",
       "\n",
       "[8 rows x 38 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.describe()\n",
    "#对数据值型特征，用常用统计量观察其分布"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Id</th>\n",
       "      <th>MSSubClass</th>\n",
       "      <th>LotFrontage</th>\n",
       "      <th>LotArea</th>\n",
       "      <th>OverallQual</th>\n",
       "      <th>OverallCond</th>\n",
       "      <th>YearBuilt</th>\n",
       "      <th>YearRemodAdd</th>\n",
       "      <th>MasVnrArea</th>\n",
       "      <th>BsmtFinSF1</th>\n",
       "      <th>...</th>\n",
       "      <th>GarageArea</th>\n",
       "      <th>WoodDeckSF</th>\n",
       "      <th>OpenPorchSF</th>\n",
       "      <th>EnclosedPorch</th>\n",
       "      <th>3SsnPorch</th>\n",
       "      <th>ScreenPorch</th>\n",
       "      <th>PoolArea</th>\n",
       "      <th>MiscVal</th>\n",
       "      <th>MoSold</th>\n",
       "      <th>YrSold</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>1459.000000</td>\n",
       "      <td>1459.000000</td>\n",
       "      <td>1232.000000</td>\n",
       "      <td>1459.000000</td>\n",
       "      <td>1459.000000</td>\n",
       "      <td>1459.000000</td>\n",
       "      <td>1459.000000</td>\n",
       "      <td>1459.000000</td>\n",
       "      <td>1444.000000</td>\n",
       "      <td>1458.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>1458.000000</td>\n",
       "      <td>1459.000000</td>\n",
       "      <td>1459.000000</td>\n",
       "      <td>1459.000000</td>\n",
       "      <td>1459.000000</td>\n",
       "      <td>1459.000000</td>\n",
       "      <td>1459.000000</td>\n",
       "      <td>1459.000000</td>\n",
       "      <td>1459.000000</td>\n",
       "      <td>1459.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>2190.000000</td>\n",
       "      <td>57.378341</td>\n",
       "      <td>68.580357</td>\n",
       "      <td>9819.161069</td>\n",
       "      <td>6.078821</td>\n",
       "      <td>5.553804</td>\n",
       "      <td>1971.357779</td>\n",
       "      <td>1983.662783</td>\n",
       "      <td>100.709141</td>\n",
       "      <td>439.203704</td>\n",
       "      <td>...</td>\n",
       "      <td>472.768861</td>\n",
       "      <td>93.174777</td>\n",
       "      <td>48.313914</td>\n",
       "      <td>24.243317</td>\n",
       "      <td>1.794380</td>\n",
       "      <td>17.064428</td>\n",
       "      <td>1.744345</td>\n",
       "      <td>58.167923</td>\n",
       "      <td>6.104181</td>\n",
       "      <td>2007.769705</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>421.321334</td>\n",
       "      <td>42.746880</td>\n",
       "      <td>22.376841</td>\n",
       "      <td>4955.517327</td>\n",
       "      <td>1.436812</td>\n",
       "      <td>1.113740</td>\n",
       "      <td>30.390071</td>\n",
       "      <td>21.130467</td>\n",
       "      <td>177.625900</td>\n",
       "      <td>455.268042</td>\n",
       "      <td>...</td>\n",
       "      <td>217.048611</td>\n",
       "      <td>127.744882</td>\n",
       "      <td>68.883364</td>\n",
       "      <td>67.227765</td>\n",
       "      <td>20.207842</td>\n",
       "      <td>56.609763</td>\n",
       "      <td>30.491646</td>\n",
       "      <td>630.806978</td>\n",
       "      <td>2.722432</td>\n",
       "      <td>1.301740</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>1461.000000</td>\n",
       "      <td>20.000000</td>\n",
       "      <td>21.000000</td>\n",
       "      <td>1470.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1879.000000</td>\n",
       "      <td>1950.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>2006.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>1825.500000</td>\n",
       "      <td>20.000000</td>\n",
       "      <td>58.000000</td>\n",
       "      <td>7391.000000</td>\n",
       "      <td>5.000000</td>\n",
       "      <td>5.000000</td>\n",
       "      <td>1953.000000</td>\n",
       "      <td>1963.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>318.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>4.000000</td>\n",
       "      <td>2007.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>2190.000000</td>\n",
       "      <td>50.000000</td>\n",
       "      <td>67.000000</td>\n",
       "      <td>9399.000000</td>\n",
       "      <td>6.000000</td>\n",
       "      <td>5.000000</td>\n",
       "      <td>1973.000000</td>\n",
       "      <td>1992.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>350.500000</td>\n",
       "      <td>...</td>\n",
       "      <td>480.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>28.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>6.000000</td>\n",
       "      <td>2008.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>2554.500000</td>\n",
       "      <td>70.000000</td>\n",
       "      <td>80.000000</td>\n",
       "      <td>11517.500000</td>\n",
       "      <td>7.000000</td>\n",
       "      <td>6.000000</td>\n",
       "      <td>2001.000000</td>\n",
       "      <td>2004.000000</td>\n",
       "      <td>164.000000</td>\n",
       "      <td>753.500000</td>\n",
       "      <td>...</td>\n",
       "      <td>576.000000</td>\n",
       "      <td>168.000000</td>\n",
       "      <td>72.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>8.000000</td>\n",
       "      <td>2009.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>2919.000000</td>\n",
       "      <td>190.000000</td>\n",
       "      <td>200.000000</td>\n",
       "      <td>56600.000000</td>\n",
       "      <td>10.000000</td>\n",
       "      <td>9.000000</td>\n",
       "      <td>2010.000000</td>\n",
       "      <td>2010.000000</td>\n",
       "      <td>1290.000000</td>\n",
       "      <td>4010.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>1488.000000</td>\n",
       "      <td>1424.000000</td>\n",
       "      <td>742.000000</td>\n",
       "      <td>1012.000000</td>\n",
       "      <td>360.000000</td>\n",
       "      <td>576.000000</td>\n",
       "      <td>800.000000</td>\n",
       "      <td>17000.000000</td>\n",
       "      <td>12.000000</td>\n",
       "      <td>2010.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>8 rows × 37 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                Id   MSSubClass  LotFrontage       LotArea  OverallQual  \\\n",
       "count  1459.000000  1459.000000  1232.000000   1459.000000  1459.000000   \n",
       "mean   2190.000000    57.378341    68.580357   9819.161069     6.078821   \n",
       "std     421.321334    42.746880    22.376841   4955.517327     1.436812   \n",
       "min    1461.000000    20.000000    21.000000   1470.000000     1.000000   \n",
       "25%    1825.500000    20.000000    58.000000   7391.000000     5.000000   \n",
       "50%    2190.000000    50.000000    67.000000   9399.000000     6.000000   \n",
       "75%    2554.500000    70.000000    80.000000  11517.500000     7.000000   \n",
       "max    2919.000000   190.000000   200.000000  56600.000000    10.000000   \n",
       "\n",
       "       OverallCond    YearBuilt  YearRemodAdd   MasVnrArea   BsmtFinSF1  \\\n",
       "count  1459.000000  1459.000000   1459.000000  1444.000000  1458.000000   \n",
       "mean      5.553804  1971.357779   1983.662783   100.709141   439.203704   \n",
       "std       1.113740    30.390071     21.130467   177.625900   455.268042   \n",
       "min       1.000000  1879.000000   1950.000000     0.000000     0.000000   \n",
       "25%       5.000000  1953.000000   1963.000000     0.000000     0.000000   \n",
       "50%       5.000000  1973.000000   1992.000000     0.000000   350.500000   \n",
       "75%       6.000000  2001.000000   2004.000000   164.000000   753.500000   \n",
       "max       9.000000  2010.000000   2010.000000  1290.000000  4010.000000   \n",
       "\n",
       "          ...        GarageArea   WoodDeckSF  OpenPorchSF  EnclosedPorch  \\\n",
       "count     ...       1458.000000  1459.000000  1459.000000    1459.000000   \n",
       "mean      ...        472.768861    93.174777    48.313914      24.243317   \n",
       "std       ...        217.048611   127.744882    68.883364      67.227765   \n",
       "min       ...          0.000000     0.000000     0.000000       0.000000   \n",
       "25%       ...        318.000000     0.000000     0.000000       0.000000   \n",
       "50%       ...        480.000000     0.000000    28.000000       0.000000   \n",
       "75%       ...        576.000000   168.000000    72.000000       0.000000   \n",
       "max       ...       1488.000000  1424.000000   742.000000    1012.000000   \n",
       "\n",
       "         3SsnPorch  ScreenPorch     PoolArea       MiscVal       MoSold  \\\n",
       "count  1459.000000  1459.000000  1459.000000   1459.000000  1459.000000   \n",
       "mean      1.794380    17.064428     1.744345     58.167923     6.104181   \n",
       "std      20.207842    56.609763    30.491646    630.806978     2.722432   \n",
       "min       0.000000     0.000000     0.000000      0.000000     1.000000   \n",
       "25%       0.000000     0.000000     0.000000      0.000000     4.000000   \n",
       "50%       0.000000     0.000000     0.000000      0.000000     6.000000   \n",
       "75%       0.000000     0.000000     0.000000      0.000000     8.000000   \n",
       "max     360.000000   576.000000   800.000000  17000.000000    12.000000   \n",
       "\n",
       "            YrSold  \n",
       "count  1459.000000  \n",
       "mean   2007.769705  \n",
       "std       1.301740  \n",
       "min    2006.000000  \n",
       "25%    2007.000000  \n",
       "50%    2008.000000  \n",
       "75%    2009.000000  \n",
       "max    2010.000000  \n",
       "\n",
       "[8 rows x 37 columns]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test.describe()\n",
    "#对数据值型特征，用常用统计量观察其分布"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "MSZoning属性的不同取值和出现的次数\n",
      "RL         1151\n",
      "RM          218\n",
      "FV           65\n",
      "RH           16\n",
      "C (all)      10\n",
      "Name: MSZoning, dtype: int64\n",
      "\n",
      "Street属性的不同取值和出现的次数\n",
      "Pave    1454\n",
      "Grvl       6\n",
      "Name: Street, dtype: int64\n",
      "\n",
      "Alley属性的不同取值和出现的次数\n",
      "Grvl    50\n",
      "Pave    41\n",
      "Name: Alley, dtype: int64\n",
      "\n",
      "LotShape属性的不同取值和出现的次数\n",
      "Reg    925\n",
      "IR1    484\n",
      "IR2     41\n",
      "IR3     10\n",
      "Name: LotShape, dtype: int64\n",
      "\n",
      "LandContour属性的不同取值和出现的次数\n",
      "Lvl    1311\n",
      "Bnk      63\n",
      "HLS      50\n",
      "Low      36\n",
      "Name: LandContour, dtype: int64\n",
      "\n",
      "Utilities属性的不同取值和出现的次数\n",
      "AllPub    1459\n",
      "NoSeWa       1\n",
      "Name: Utilities, dtype: int64\n",
      "\n",
      "LotConfig属性的不同取值和出现的次数\n",
      "Inside     1052\n",
      "Corner      263\n",
      "CulDSac      94\n",
      "FR2          47\n",
      "FR3           4\n",
      "Name: LotConfig, dtype: int64\n",
      "\n",
      "LandSlope属性的不同取值和出现的次数\n",
      "Gtl    1382\n",
      "Mod      65\n",
      "Sev      13\n",
      "Name: LandSlope, dtype: int64\n",
      "\n",
      "Neighborhood属性的不同取值和出现的次数\n",
      "NAmes      225\n",
      "CollgCr    150\n",
      "OldTown    113\n",
      "Edwards    100\n",
      "Somerst     86\n",
      "Gilbert     79\n",
      "NridgHt     77\n",
      "Sawyer      74\n",
      "NWAmes      73\n",
      "SawyerW     59\n",
      "BrkSide     58\n",
      "Crawfor     51\n",
      "Mitchel     49\n",
      "NoRidge     41\n",
      "Timber      38\n",
      "IDOTRR      37\n",
      "ClearCr     28\n",
      "SWISU       25\n",
      "StoneBr     25\n",
      "MeadowV     17\n",
      "Blmngtn     17\n",
      "BrDale      16\n",
      "Veenker     11\n",
      "NPkVill      9\n",
      "Blueste      2\n",
      "Name: Neighborhood, dtype: int64\n",
      "\n",
      "Condition1属性的不同取值和出现的次数\n",
      "Norm      1260\n",
      "Feedr       81\n",
      "Artery      48\n",
      "RRAn        26\n",
      "PosN        19\n",
      "RRAe        11\n",
      "PosA         8\n",
      "RRNn         5\n",
      "RRNe         2\n",
      "Name: Condition1, dtype: int64\n",
      "\n",
      "Condition2属性的不同取值和出现的次数\n",
      "Norm      1445\n",
      "Feedr        6\n",
      "RRNn         2\n",
      "Artery       2\n",
      "PosN         2\n",
      "RRAe         1\n",
      "PosA         1\n",
      "RRAn         1\n",
      "Name: Condition2, dtype: int64\n",
      "\n",
      "BldgType属性的不同取值和出现的次数\n",
      "1Fam      1220\n",
      "TwnhsE     114\n",
      "Duplex      52\n",
      "Twnhs       43\n",
      "2fmCon      31\n",
      "Name: BldgType, dtype: int64\n",
      "\n",
      "HouseStyle属性的不同取值和出现的次数\n",
      "1Story    726\n",
      "2Story    445\n",
      "1.5Fin    154\n",
      "SLvl       65\n",
      "SFoyer     37\n",
      "1.5Unf     14\n",
      "2.5Unf     11\n",
      "2.5Fin      8\n",
      "Name: HouseStyle, dtype: int64\n",
      "\n",
      "RoofStyle属性的不同取值和出现的次数\n",
      "Gable      1141\n",
      "Hip         286\n",
      "Flat         13\n",
      "Gambrel      11\n",
      "Mansard       7\n",
      "Shed          2\n",
      "Name: RoofStyle, dtype: int64\n",
      "\n",
      "RoofMatl属性的不同取值和出现的次数\n",
      "CompShg    1434\n",
      "Tar&Grv      11\n",
      "WdShngl       6\n",
      "WdShake       5\n",
      "Roll          1\n",
      "ClyTile       1\n",
      "Membran       1\n",
      "Metal         1\n",
      "Name: RoofMatl, dtype: int64\n",
      "\n",
      "Exterior1st属性的不同取值和出现的次数\n",
      "VinylSd    515\n",
      "HdBoard    222\n",
      "MetalSd    220\n",
      "Wd Sdng    206\n",
      "Plywood    108\n",
      "CemntBd     61\n",
      "BrkFace     50\n",
      "WdShing     26\n",
      "Stucco      25\n",
      "AsbShng     20\n",
      "Stone        2\n",
      "BrkComm      2\n",
      "AsphShn      1\n",
      "CBlock       1\n",
      "ImStucc      1\n",
      "Name: Exterior1st, dtype: int64\n",
      "\n",
      "Exterior2nd属性的不同取值和出现的次数\n",
      "VinylSd    504\n",
      "MetalSd    214\n",
      "HdBoard    207\n",
      "Wd Sdng    197\n",
      "Plywood    142\n",
      "CmentBd     60\n",
      "Wd Shng     38\n",
      "Stucco      26\n",
      "BrkFace     25\n",
      "AsbShng     20\n",
      "ImStucc     10\n",
      "Brk Cmn      7\n",
      "Stone        5\n",
      "AsphShn      3\n",
      "CBlock       1\n",
      "Other        1\n",
      "Name: Exterior2nd, dtype: int64\n",
      "\n",
      "MasVnrType属性的不同取值和出现的次数\n",
      "None       864\n",
      "BrkFace    445\n",
      "Stone      128\n",
      "BrkCmn      15\n",
      "Name: MasVnrType, dtype: int64\n",
      "\n",
      "ExterQual属性的不同取值和出现的次数\n",
      "TA    906\n",
      "Gd    488\n",
      "Ex     52\n",
      "Fa     14\n",
      "Name: ExterQual, dtype: int64\n",
      "\n",
      "ExterCond属性的不同取值和出现的次数\n",
      "TA    1282\n",
      "Gd     146\n",
      "Fa      28\n",
      "Ex       3\n",
      "Po       1\n",
      "Name: ExterCond, dtype: int64\n",
      "\n",
      "Foundation属性的不同取值和出现的次数\n",
      "PConc     647\n",
      "CBlock    634\n",
      "BrkTil    146\n",
      "Slab       24\n",
      "Stone       6\n",
      "Wood        3\n",
      "Name: Foundation, dtype: int64\n",
      "\n",
      "BsmtQual属性的不同取值和出现的次数\n",
      "TA    649\n",
      "Gd    618\n",
      "Ex    121\n",
      "Fa     35\n",
      "Name: BsmtQual, dtype: int64\n",
      "\n",
      "BsmtCond属性的不同取值和出现的次数\n",
      "TA    1311\n",
      "Gd      65\n",
      "Fa      45\n",
      "Po       2\n",
      "Name: BsmtCond, dtype: int64\n",
      "\n",
      "BsmtExposure属性的不同取值和出现的次数\n",
      "No    953\n",
      "Av    221\n",
      "Gd    134\n",
      "Mn    114\n",
      "Name: BsmtExposure, dtype: int64\n",
      "\n",
      "BsmtFinType1属性的不同取值和出现的次数\n",
      "Unf    430\n",
      "GLQ    418\n",
      "ALQ    220\n",
      "BLQ    148\n",
      "Rec    133\n",
      "LwQ     74\n",
      "Name: BsmtFinType1, dtype: int64\n",
      "\n",
      "BsmtFinType2属性的不同取值和出现的次数\n",
      "Unf    1256\n",
      "Rec      54\n",
      "LwQ      46\n",
      "BLQ      33\n",
      "ALQ      19\n",
      "GLQ      14\n",
      "Name: BsmtFinType2, dtype: int64\n",
      "\n",
      "Heating属性的不同取值和出现的次数\n",
      "GasA     1428\n",
      "GasW       18\n",
      "Grav        7\n",
      "Wall        4\n",
      "OthW        2\n",
      "Floor       1\n",
      "Name: Heating, dtype: int64\n",
      "\n",
      "HeatingQC属性的不同取值和出现的次数\n",
      "Ex    741\n",
      "TA    428\n",
      "Gd    241\n",
      "Fa     49\n",
      "Po      1\n",
      "Name: HeatingQC, dtype: int64\n",
      "\n",
      "CentralAir属性的不同取值和出现的次数\n",
      "Y    1365\n",
      "N      95\n",
      "Name: CentralAir, dtype: int64\n",
      "\n",
      "Electrical属性的不同取值和出现的次数\n",
      "SBrkr    1334\n",
      "FuseA      94\n",
      "FuseF      27\n",
      "FuseP       3\n",
      "Mix         1\n",
      "Name: Electrical, dtype: int64\n",
      "\n",
      "KitchenQual属性的不同取值和出现的次数\n",
      "TA    735\n",
      "Gd    586\n",
      "Ex    100\n",
      "Fa     39\n",
      "Name: KitchenQual, dtype: int64\n",
      "\n",
      "Functional属性的不同取值和出现的次数\n",
      "Typ     1360\n",
      "Min2      34\n",
      "Min1      31\n",
      "Mod       15\n",
      "Maj1      14\n",
      "Maj2       5\n",
      "Sev        1\n",
      "Name: Functional, dtype: int64\n",
      "\n",
      "FireplaceQu属性的不同取值和出现的次数\n",
      "Gd    380\n",
      "TA    313\n",
      "Fa     33\n",
      "Ex     24\n",
      "Po     20\n",
      "Name: FireplaceQu, dtype: int64\n",
      "\n",
      "GarageType属性的不同取值和出现的次数\n",
      "Attchd     870\n",
      "Detchd     387\n",
      "BuiltIn     88\n",
      "Basment     19\n",
      "CarPort      9\n",
      "2Types       6\n",
      "Name: GarageType, dtype: int64\n",
      "\n",
      "GarageFinish属性的不同取值和出现的次数\n",
      "Unf    605\n",
      "RFn    422\n",
      "Fin    352\n",
      "Name: GarageFinish, dtype: int64\n",
      "\n",
      "GarageQual属性的不同取值和出现的次数\n",
      "TA    1311\n",
      "Fa      48\n",
      "Gd      14\n",
      "Ex       3\n",
      "Po       3\n",
      "Name: GarageQual, dtype: int64\n",
      "\n",
      "GarageCond属性的不同取值和出现的次数\n",
      "TA    1326\n",
      "Fa      35\n",
      "Gd       9\n",
      "Po       7\n",
      "Ex       2\n",
      "Name: GarageCond, dtype: int64\n",
      "\n",
      "PavedDrive属性的不同取值和出现的次数\n",
      "Y    1340\n",
      "N      90\n",
      "P      30\n",
      "Name: PavedDrive, dtype: int64\n",
      "\n",
      "PoolQC属性的不同取值和出现的次数\n",
      "Gd    3\n",
      "Ex    2\n",
      "Fa    2\n",
      "Name: PoolQC, dtype: int64\n",
      "\n",
      "Fence属性的不同取值和出现的次数\n",
      "MnPrv    157\n",
      "GdPrv     59\n",
      "GdWo      54\n",
      "MnWw      11\n",
      "Name: Fence, dtype: int64\n",
      "\n",
      "MiscFeature属性的不同取值和出现的次数\n",
      "Shed    49\n",
      "Gar2     2\n",
      "Othr     2\n",
      "TenC     1\n",
      "Name: MiscFeature, dtype: int64\n",
      "\n",
      "SaleType属性的不同取值和出现的次数\n",
      "WD       1267\n",
      "New       122\n",
      "COD        43\n",
      "ConLD       9\n",
      "ConLI       5\n",
      "ConLw       5\n",
      "CWD         4\n",
      "Oth         3\n",
      "Con         2\n",
      "Name: SaleType, dtype: int64\n",
      "\n",
      "SaleCondition属性的不同取值和出现的次数\n",
      "Normal     1198\n",
      "Partial     125\n",
      "Abnorml     101\n",
      "Family       20\n",
      "Alloca       12\n",
      "AdjLand       4\n",
      "Name: SaleCondition, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "categorical_features = train.select_dtypes(include = [\"object\"]).columns\n",
    "for col in categorical_features:\n",
    "    print ('\\n%s属性的不同取值和出现的次数'%col)\n",
    "    print (train[col].value_counts())\n",
    "#对类别型特征，观察其取值范围"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "发现大多数类别型特征的取值并不是很多，一些特征的取值有序关系，可以将其变换为有序数字\n",
    "其他的类别型特征可以用One hot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "train.drop(['Id'], inplace = True, axis = 1)\n",
    "test_id = test['Id']\n",
    "test.drop(['Id'], inplace = True, axis = 1)\n",
    "#删除掉ID那一列，因为ID只是用来标记样本的。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZsAAAEWCAYAAACwtjr+AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3Xu4XFWd5vHva7gqiQkQEEnaoEZanBkRjhDF8YJ2SFAJ7UAL7ZjIoOlm1NFH+1EYZwyi9qCPI92MiEZhCN4gokiaBuMRvLQOl5xwj4g5CJjTocnBAIniI4q/+WOvgk1R93N2XXa9n+epp/Ze+7LWTurUr9baa62tiMDMzKxIz+h1AczMrPwcbMzMrHAONmZmVjgHGzMzK5yDjZmZFc7BxszMCudgY1ZF0g8lvbPOtqslrSgo39MkPSDpN5L2KSKPTkl6raSJ3PomSa/tYZFswOzS6wKYTYWke4F3RsT3u5FfRCwt4rySdgU+CyyKiFuLyKPN8gSwMCLGa22PiJd0uUg24FyzMesP+wN7AJvaPVCZgfhbluQfuENqID6gZp2Q9C5J45K2S1on6bm5ba+UtEHSI+n9lXXOcYCk2yT9XVp/oolN0jsk/UTSZyQ9JOkeSUtzxx4k6ceSdkr6vqTzJH21Rh4vAu5Kqw9LurZZGVM5Pinpp8CjwPNrnPfFab+HU7PXcVXHvzO3/g5JP0nLP07Jt6YmvbfWOPe9kt6Qlp8h6XRJd0v6taS1kvZO2xZICkmnSvoVcK2kPSR9Ne37cLq2/Wv9+1t5ONhYKUk6GvhfwF8BBwD3AZekbXsD/wycC+xD1nz1z9X3SSQtAH4EfC4iPlMnqyPJAsW+wKeBCyQpbfs6cGPK40zg7bVOEBG/ACrNUrMj4ugWy/h2YCUwM11fvuy7Av8EfA/YD3gv8DVJB9e5jnx5Xp0WXxoRe0XEpU0O+W/A8cBrgOcCDwHnVe3zGuDFwDHACuDZwPx0bX8L/K5ZuWywOdhYWb0NuDAiboqI3wNnAK9IAeSNwOaI+EpE/DEivgH8HHhz7vhDgB8CqyJidYN87ouIL0XE48AassC2v6Q/A14OfDQiHouInwDr2ih/K2W8KCI2pe1/qDp+EbAXcHbK/1rgSuDkNsrQqr8BPhIRE+nf+kzghKomszMj4rcR8TvgD2RB5oUR8XhEbIyIHQWUy/qIg42V1XPJ/dqPiN8AvwYOrN6W3Je2VbwN+Ffgsib5/Fsuj0fT4l4pj+25NIAtnZa/Thkbne+5wJaI+FOD46fL84DLU5PYw8CdwONk96Eq8mX9CrAeuETSVkmfTjUxKzEHGyurrWRfggBIehbZr+l/rd6W/FnaVnEm8CDwdUkzOsj/fmBvSc/Mpc1v4/hWythoyvatwPyqjgP5438L5Mv2nDbKVm0LsDQiZudee0REzbJGxB8i4mMRcQjwSuBNwPIp5G8DwMHGymDXdNO58tqF7H7JKZIOlbQ78PfADRFxL3AV8CJJfy1pl3QD/BCyZqaKPwAnAs8CvtJub6+IuA8YA86UtJukV/DUJrBmWiljIzeQBZQPSdo1jYl5M+m+FXAL8BZJz5T0QuDUquMfoEangzq+AHxS0vMAJM2VtKzezpJeJ+nfpyC+g+zf+vEW87IB5WBjZXAV2Q3myuvMiLgG+J/At8hqGS8ATgKIiF+T/Zr+IFnT2oeAN0XEg/mTRsRjwFvIbrBf2EH34rcBr0h5fAK4FPh9Kwe2WsYGxz8GHAcsJauhfR5YHhE/T7ucAzxGFlTWAF+rOsWZwJrUNPZXTbL7R7L7Ud+TtBO4nqzjRD3PIWue3EHW5PYj4Gm99Kxc5IenmXWHpEuBn0fEql6XxazbXLMxK4ikl0t6QRqHsgRYBnyn1+Uy6wWP5jUrznOAb5N1TJgATouIm3tbJLPecDOamZkVzs1oZmZWODejJfvuu28sWLCg18UwMxsoGzdufDAi5jbbz8EmWbBgAWNjY70uhpnZQJFUPdNFTW5GMzOzwjnYmJlZ4RxszMyscA42ZmZWOAcbMzMrnIONmQ28WbNAevpr1qxel8wqHGzMbODt3NleunWfg42ZmRXOwcbMhoab23rHwcbMhoab23rHwcbMzArnYGNmA2/mzPbSrfs8EaeZDbwdO3pdAmvGNRszMyucg42ZDQ03t/WOm9HMbGi4ua13XLMxM7PCOdiYmVnhHGzMzKxwDjZmZla4woKNpIMl3ZJ77ZD0fkl7SxqVtDm9z0n7S9K5ksYl3SbpsNy5VqT9N0takUs/XNLt6ZhzJSml18zDzMx6o7BgExF3RcShEXEocDjwKHA5cDpwTUQsBK5J6wBLgYXptRI4H7LAAawCjgSOAFblgsf5ad/KcUtSer08zMysB7rVjPZ64O6IuA9YBqxJ6WuA49PyMuDiyFwPzJZ0AHAMMBoR2yPiIWAUWJK2zYqI6yIigIurzlUrDzMz64FuBZuTgG+k5f0j4n6A9L5fSj8Q2JI7ZiKlNUqfqJHeKI+nkLRS0pikscnJyQ4vzczMmik82EjaDTgO+GazXWukRQfpLYuI1RExEhEjc+fObedQMzNrQzdqNkuBmyLigbT+QGoCI71vS+kTwPzccfOArU3S59VIb5SHmZn1QDeCzck82YQGsA6o9ChbAVyRS1+eeqUtAh5JTWDrgcWS5qSOAYuB9WnbTkmLUi+05VXnqpWHmZn1QKFzo0l6JvAXwN/kks8G1ko6FfgVcGJKvwo4Fhgn67l2CkBEbJf0cWBD2u+siNielk8DLgL2BK5Or0Z5mJlZDyjryGUjIyMxNjbW62KYmQ0USRsjYqTZfp5BwMzMCudgY2ZmhXOwMTOzwjnYmJlZ4RxszMyscA42ZmZWOAcbMzMrnIONmZkVzsHGzMwK52BjZmaFc7AxM7PCOdiYmVnhHGzMrCWzZoH09NesWb0umQ0CBxsza8nOne2lm+U52JiZWeEcbMzMrHAONmb4foRZ0RxszPD9CLOiOdiYWUtmzmwv3Syv0GAjabakyyT9XNKdkl4haW9Jo5I2p/c5aV9JOlfSuKTbJB2WO8+KtP9mSSty6YdLuj0dc64kpfSaeZhZ53bsgIinv3bs6HXJbBAUXbP5R+C7EfHnwEuBO4HTgWsiYiFwTVoHWAosTK+VwPmQBQ5gFXAkcASwKhc8zk/7Vo5bktLr5WFmZj1QWLCRNAt4NXABQEQ8FhEPA8uANWm3NcDxaXkZcHFkrgdmSzoAOAYYjYjtEfEQMAosSdtmRcR1ERHAxVXnqpWHmZn1QJE1m+cDk8D/lXSzpC9Lehawf0TcD5De90v7HwhsyR0/kdIapU/USKdBHk8haaWkMUljk5OTnV+pDTzfjzArVpHBZhfgMOD8iHgZ8FsaN2epRlp0kN6yiFgdESMRMTJ37tx2DrWS6eb9CHeztmFUZLCZACYi4oa0fhlZ8HkgNYGR3rfl9p+fO34esLVJ+rwa6TTIw6zn3M3ahlFhwSYi/g3YIunglPR64GfAOqDSo2wFcEVaXgcsT73SFgGPpCaw9cBiSXNSx4DFwPq0baekRakX2vKqc9XKw8zMemCXgs//XuBrknYDfgmcQhbg1ko6FfgVcGLa9yrgWGAceDTtS0Rsl/RxYEPa76yI2J6WTwMuAvYErk4vgLPr5GFmZj2grCOXjYyMxNjYWK+LYUNAte42Jv5ztEEjaWNEjDTbzzMImJlZ4RxszLrM3axtGBV9z8bMqnh6FxtGrtmYVenWOBiPt7Fh4mBjVqVb42A83saGiYONmZkVzsHGzMwK52BjZmaFc7AxKyl3QLB+4mBjVqVb42CKzscdEKyfeJyNWZVujYPxeBsbJq7ZmJlZ4RxszMyscA42ZmZWOAcbs5LyhJ/WT9xBwKyk3AHB+olrNmZmVjgHGzMzK1yhwUbSvZJul3SLpLGUtrekUUmb0/uclC5J50oal3SbpMNy51mR9t8saUUu/fB0/vF0rBrlYcPNI+rNeqcbNZvXRcShuWdUnw5cExELgWvSOsBSYGF6rQTOhyxwAKuAI4EjgFW54HF+2rdy3JImedgQ84h6s97pRTPaMmBNWl4DHJ9Lvzgy1wOzJR0AHAOMRsT2iHgIGAWWpG2zIuK6iAjg4qpz1crDzMx6oOhgE8D3JG2UtDKl7R8R9wOk9/1S+oHAltyxEymtUfpEjfRGeTyFpJWSxiSNTU5OdniJZmbWTNFdn4+KiK2S9gNGJf28wb6qkRYdpLcsIlYDqwFGRkbaOtas12bNqt0EOHOmuz1b/ym0ZhMRW9P7NuBysnsuD6QmMNL7trT7BDA/d/g8YGuT9Hk10mmQh1lp+B6UDZLCgo2kZ0maWVkGFgN3AOuASo+yFcAVaXkdsDz1SlsEPJKawNYDiyXNSR0DFgPr07adkhalXmjLq85VKw8bYh5Rb9Y7RTaj7Q9cnnoj7wJ8PSK+K2kDsFbSqcCvgBPT/lcBxwLjwKPAKQARsV3Sx4ENab+zImJ7Wj4NuAjYE7g6vQDOrpOHDTE3LXXOTXY2Vco6ctnIyEiMjY31uhhmLVOtu5bJdP9ZdzMvGyySNuaGttTlGQTMpoEHjJo15mBjpdDrL/te3Kz3PSgbJJ712UphGHtm+V6JDZKWazaSXiXplLQ8V9JBxRXLzMzKpKVgI2kV8GHgjJS0K/DVogpl1g29bnobJG6ys6lqtRntL4GXATdBNlizMobGbFANY9Nbp9xkZ1PVajPaY2myy4AnBmmaWdLoF75rTWatB5u1kr5INhPzu4DvA18qrlhm7el1M8+OHdl4k8qrHteabFi11IwWEZ+R9BfADuBg4KMRMVpoycxqaDSSfVAGF86aVb9ZyiP1raxaCjap59m/VAKMpD0lLYiIe4ssnFm1MtxnaVTWMlyfWS2tNqN9E/hTbv3xlGY2sHrd9FYE97CzftVqb7RdIuKxykpEPCZpt4LKZNYVZWyWcs3I+lWrNZtJScdVViQtAx4spkhmnen0V30RtYFmtSPXPmzYtFqz+Vvga5I+R/aEzC1kz48x6xud/qovojawY0fjmZKnOz+zftdqb7S7gUWS9iJ7LIH/LKwnZs6s31ur0Ze11P0eXc3K1M4xg3wfyQyaBBtJ/zkivirpA1XpAETEZwssm9nTNAoWzWoS3ag5tNJ1uVE5y3gfyQya12wqMwX4d5VZC3p9g75RbarR+B6zojUMNhHxRUkzgB0RcU6XymRmHWp0r8j3hKyXmvZGi4jHgeOa7WfWbdW9yDrV6rxmU+01VilvJ+UwG3Stdn3+f5I+J+k/Sjqs8mrlQEkzJN0s6cq0fpCkGyRtlnRpZbyOpN3T+njaviB3jjNS+l2SjsmlL0lp45JOz6XXzMPKZbp+qVfPa1bU/GaNjotwE5eVW6vB5pXAS4CzgP+dXp9p8dj3AXfm1j8FnBMRC4GHgFNT+qnAQxHxQuCctB+SDgFOSvkvAT6fAtgM4DxgKXAIcHLat1EeNgDqjXupfrXLNQez3mkp2ETE62q8jm52nKR5wBuBL6d1AUcDl6Vd1gDHp+VlaZ20/fVp/2XAJRHx+4i4BxgHjkiv8Yj4ZZrd4BJgWZM8bIq6MXByOmostWoq3ag5lHEKHLPp0DDYSDpS0q2SfiPpOkkvbvP8/wB8iCfnVdsHeDgi/pjWJ4AD0/KBZINFSdsfSfs/kV51TL30RnlUX99KSWOSxiYnJ9u8tOHUTwMni5APip2o1yTX7fE97aSbdUOzms15wN+RfYF/lix4tETSm4BtEbExn1xj12iybbrSn54YsToiRiJiZO7cubV2sQHV6c386Qh+9Wpx3dIPAc+sWrNg84yIGE1NWN8E2vlGPgo4TtK9ZE1cR5MFq9mSKl2u5wFb0/IEMB8gbX82sD2fXnVMvfQHG+RhQ6aImlOtGkI+wHQya0AveaZo64ZmwWa2pLdUXjXW64qIMyJiXkQsILvBf21EvA34AXBC2m0FcEVaXpfWSduvTY+iXgeclHqrHQQsBG4ENgALU8+z3VIe69Ix9fKwPtSrGkC7GtUQWg0w/VjjGJQmThtszWYQ+BHw5jrrAXy7gzw/DFwi6RPAzcAFKf0C4CuSxslqNCcBRMQmSWuBnwF/BN6dxv4g6T3AemAGcGFEbGqSh5mZ9YBiUJ6lW7CRkZEYGxvrdTH6XqePLa53XDe08xFvVLNqdJ5Wa2T9+OfW6TWbAUjaGBEjzfZrqeuzpP0lXSDp6rR+iCSPXRlCnd58rhzX6/sTzbgnl1kxWh3UeRFZc9Vz0/ovgPcXUSArt27XbtoNEu0E03a7STtg2TBrNdjsGxFrSeNl0hiWxwsrldk0KPrmeyuBc+bM7nQEmEqPMtfmrBtafVLnbyXtQxqvImkR2aBLs8J18hCy6dLoHlUj3b7XMZUeZb3uDWfDodVg8wGyLsgvkPRTsvE2JzQ+xGzqKh0POukS3coxzTo2uFuw2fRo9bHQN0l6DXAw2Qj9uyLiD4WWzIZatx7hXB00etlrzqzMmj0Wut7AzRdJIiI6GWdjQ6xek9hUgku+yaqoGtB0nbNbQdSs3zSr2by5wbZOB3XaECvqi7YXNZJO7iW51mTDqtljoU/pVkFseHQyMLRRjQh68yXe6b2kIjT79zHrtVY7CCDpjWQPMNujkhYRZxVRKCu3dm66dzpjQTf0S6CB3v9bmDXT6gwCXwDeCryXrIPAicDzCiyXGeDeYGZl0fJjoSNiOdljmz8GvIKnTu9v1lA7o+2n+gCzfudp/G0YtRpsfpfeH5X0XLLZlw8qpkhWRu3URIap1jJM12rDrdVgc6Wk2cCngY3APWQPRDObVp3WZvr1Rnhlqppu8YPQrF81DDaSXi7pORHx8Yh4GNgLuB34JnBONwpog63y5VeE6geuVb7Y+ynwdLs50Pe4rF81q9l8EXgMQNKrgbNT2iPA6mKLZmXQrS+5nTuf/AXvL1az/tOs6/OMiNielt8KrI6IbwHfknRLsUUza1+ngaaXk32aDYNmNZsZkioB6fXAtbltLY/RMet3vQo0/dTkZ1akZsHmG8CPJF1B1iPtXwAkvZAmjxiQtIekGyXdKmmTpI+l9IMk3SBps6RLJe2W0ndP6+Np+4Lcuc5I6XdJOiaXviSljUs6PZdeMw+zfpNv/jMrs4bBJiI+CXyQ7Emdr4p4ol/NM8gGeDbye+DoiHgpcCiwJD0H51PAORGxEHgIqDxe+lSycTwvJOt88CnIHkENnEQ2e8ES4POSZkiaAZwHLAUOAU5O+9IgD7O+NF01Kz8IzfpV067PEXF9RFweEb/Npf0iIm5qclxExG/S6q7pFcDRwGUpfQ1wfFpeltZJ218vSSn9koj4fUTcA4wDR6TXeET8MiIeI+uKvSwdUy8PGwJT+WId9C/ldh5rXc3dpq1IrY6z6UiqgdwCbANGgbuBh9NjpQEmgAPT8oHAFnjisdOPAPvk06uOqZe+T4M8qsu3UtKYpLHJycmpXKoVrJ0uzTt2dB408l/Ww8bdpq1IhQabiHg8Ig4F5pHVRF5ca7f0Xms0Qkxjeq3yrY6IkYgYmTt3bq1drANFTTdTCQSt7tuJ6rE73eQahZVZocGmIg0I/SGwCJid6+E2D9ialidI862l7c8GtufTq46pl/5ggzxsGtVrdinyl3ArX8Bl+aJ2jcLKpLBgI2lumuIGSXsCbwDuBH4AnJB2WwFckZbXpXXS9mtTh4R1wEmpt9pBwELgRmADsDD1PNuNrBPBunRMvTxsGnXzy7DdQNaLL+rq+yRm9qQix8ocAKxJvcaeAayNiCsl/Qy4RNIngJuBC9L+FwBfkTROVqM5CSAiNklaC/yMbALQd0fE4wCS3gOsB2YAF0bEpnSuD9fJw6ww0lOfs9NKwCnrzNZm1RT+CQbAyMhIjI2N9boYA8VflLW18yfV7N+w25N49uuD6qx/SdoYESPN9vMsAGbTLB9ABumLelDKaYOpKx0EzIbVVO4dDfqYH7M8BxvrmL8MW9NpzzjXNKxMHGysY/4ybE2ldlOrq7jZsHCwsbo8fUl9ndTqPG7GhpmDjdXVaPqSYf9lnv+3KaI50U2UVjYONvY0RT7KuYx27px6cOhk4kyzQeJgY0/j5p72TXdwcBOmlY2Djdk0mO6aoGdgtrJxsLEnuPlsMLjWY4PIwcae4F/Ng8G1HhtEDjZmBat0HvAjm22YeW40sxbVm+esUdNjfiJN9zCzYeaajVkTveiO7FqQlY1rNmYN9OrLvVFgcycOG0Su2dgT/Ks50+7gyuoBmUU/rdO1HhtErtlY3YdmWX/yvR8bRK7ZDDkHGjPrhsKCjaT5kn4g6U5JmyS9L6XvLWlU0ub0PielS9K5ksYl3SbpsNy5VqT9N0takUs/XNLt6Zhzpaw1u14e9nQONE/VqCmqHwZT9kMZzDpRZM3mj8AHI+LFwCLg3ZIOAU4HromIhcA1aR1gKbAwvVYC50MWOIBVwJHAEcCqXPA4P+1bOW5JSq+Xh1ldzR7h3GgwZbeCwHQN6HTQsm4rLNhExP0RcVNa3gncCRwILAPWpN3WAMen5WXAxZG5Hpgt6QDgGGA0IrZHxEPAKLAkbZsVEddFRAAXV52rVh5mdU2lljdoo/oHrbw2+Lpyz0bSAuBlwA3A/hFxP2QBCdgv7XYgsCV32ERKa5Q+USOdBnlY4nnQzKybCg82kvYCvgW8PyIa9aOp9dUXHaS3U7aVksYkjU1OTrZz6EBzpwAz67ZCg42kXckCzdci4tsp+YHUBEZ635bSJ4D5ucPnAVubpM+rkd4oj6eIiNURMRIRI3Pnzu3sIgeQA41Zc76vNb2K7I0m4ALgzoj4bG7TOqDSo2wFcEUufXnqlbYIeCQ1ga0HFkuakzoGLAbWp207JS1KeS2vOletPErPfyDF8aDJ4eL7WtOryEGdRwFvB26XdEtK++/A2cBaSacCvwJOTNuuAo4FxoFHgVMAImK7pI8DG9J+Z0XE9rR8GnARsCdwdXrRII/Sa9Zjyn8o9TULJpWeau3c65ruADVzZu3/w3bzma7zmLVKUdScGgNmZGQkxsbGel2MKfNN//Z08vFv59/Yf16Dq9XZvIedpI0RMdJsP88gYEOtUbNjvW2tci3B7EkONjbUGjU7NmpyrEy02WhSTM9hZvYkB5uS8LiZ3tixo/Zszw40g8+za08vB5uS8I3/7nKPv/Ir8w+JXvRadbAZMFO9jzDMmjV9TZUDvg2KXnTrdrAZMP5Cmzr/G5p1nx+eZkPDNUCz3nHNxizH4yfMiuFgY2ZmhXOwGRDu2tw9nXYgcJdYGxS96NbtezZ9zMGl+2bNqt+1tdn/Rxm6xNpw6MVn1TUbs5ydO+uPPXDNxaxzDjZ9ygME+0elq7RrLmadc7DpUx4LYmZl4mDTR/KzA5iZlYmDTR+oBBnXZvqfJ2c064x7o/UBB5niVAZpTldt0fdtzDrjYGOl5iZJs/7gZjQbeDNnPnUKeDPrP4UFG0kXStom6Y5c2t6SRiVtTu9zUroknStpXNJtkg7LHbMi7b9Z0opc+uGSbk/HnCtlv2Hr5WHlU6bni5iVXZE1m4uAJVVppwPXRMRC4Jq0DrAUWJheK4HzIQscwCrgSOAIYFUueJyf9q0ct6RJHmZ1uUZkVqzCgk1E/BjYXpW8DFiTltcAx+fSL47M9cBsSQcAxwCjEbE9Ih4CRoEladusiLguIgK4uOpctfLoO57vzMyGRbfv2ewfEfcDpPf9UvqBwJbcfhMprVH6RI30Rnk8jaSVksYkjU1OTnZ8UZ2YNcu90JqZObN7XY0d9M2K1S8dBGr9qUcH6W2JiNURMRIRI3Pnzm338ClxoGlNJ8+Bn+5A5DE0ZlPX7WDzQGoCI71vS+kTwPzcfvOArU3S59VIb5SHDZhOA3K9ANVJ0HAHBLPp0e1gsw6o9ChbAVyRS1+eeqUtAh5JTWDrgcWS5qSOAYuB9WnbTkmLUi+05VXnqpWH9VA/1A7qBSEzK15hgzolfQN4LbCvpAmyXmVnA2slnQr8Cjgx7X4VcCwwDjwKnAIQEdslfRzYkPY7KyIqnQ5OI+vxtidwdXrRIA/rkZkzn6wd+F6V2XBS+KcdACMjIzE2Nta1/IbphnSzj1ijf4tufDx7nb/ZIJO0MSJGmu3XLx0ESi0/m7Nnde4/nlzTrHieG60LhrnZqJUv7Jkza/8bdevL3h0AzIrnYDPNhv2eRCfNTv6yNys/N6NNs7IHmm4OtDSz8nDNxp4Q4ZvlZlYMB5sey3+Bu+OAmZWVm9F6yM1OZjYsXLPpon5thurXcplZebhm06FaY2caNYMNSi2mUTml7LrNzNrlYNOhRr3Oak36uHPn9H9Z5x+HPJVz5FXmD6un7L3tzKwYbkYrSL0v5UZf1vUGN+a3dzImxc1kZtZrrtn0kWaBxIMfzWxQOdiYmVnhHGzMzKxwDjYd6rcpW6a7PP12fWY22NxBoEPN7p90OpNxp8dN9/0c3x8ys+nkYFOQTr+s/SVvZmXkZjQzMytcaYONpCWS7pI0Lun0XpfHzGyYlTLYSJoBnAcsBQ4BTpZ0SG9LZWY2vEoZbIAjgPGI+GVEPAZcAizrcZnMzIZWWYPNgcCW3PpESnsKSSsljUkam5yc7FrhzMyGTVl7o9Waf/lpM4RFxGpgNYCkSUn3FV2wLtsXeLDXheiCYbhOX2N5lO06n9fKTmUNNhPA/Nz6PGBrowMiYm6hJeoBSWMRMdLrchRtGK7T11gew3Kd1crajLYBWCjpIEm7AScB63pcJjOzoVXKmk1E/FHSe4D1wAzgwojY1ONimZkNrVIGG4CIuAq4qtfl6LHVvS5AlwzDdfoay2NYrvMpFH6ylpmZFays92zMzKyPONiYmVnhHGwGjKQLJW2TdEcubW9Jo5I2p/c5KV2Szk3zw90m6bDcMSvS/pslrejFtdQjab6kH0i6U9ImSe9L6aW5Tkl7SLpR0q3pGj+W0g+SdEMq76WpNyWSdk/r42n7gty5zkjpd0k6pjdXVJ+kGZJulnRlWi/jNd4r6XZJt0gaS2ml+bxOi4jwa4BewKuBw4A7cmmfBk5Py6cDn0rLxwJXkw1yXQTckNL3Bn6Z3uek5Tm9vrbc9RwAHJaWZwK/IJtFI9arAAAFZElEQVTjrjTXmcq6V1reFbghlX0tcFJK/wJwWlr+r8AX0vJJwKVp+RDgVmB34CDgbmBGr6+v6lo/AHwduDKtl/Ea7wX2rUorzed1Ol6u2QyYiPgxsL0qeRmwJi2vAY7PpV8cmeuB2ZIOAI4BRiNie0Q8BIwCS4ovfWsi4v6IuCkt7wTuJJtuqDTXmcr6m7S6a3oFcDRwWUqvvsbKtV8GvF6SUvolEfH7iLgHGCebG7AvSJoHvBH4cloXJbvGBkrzeZ0ODjblsH9E3A/ZFzWwX0qvN0dcS3PH9YPUlPIysl/+pbrO1Lx0C7CN7IvlbuDhiPhj2iVf3ieuJW1/BNiHPr9G4B+ADwF/Suv7UL5rhOyHwvckbZS0MqWV6vM6VaUdZ2NA/TniWpo7rtck7QV8C3h/ROzIfuTW3rVGWt9fZ0Q8DhwqaTZwOfDiWrul94G7RklvArZFxEZJr60k19h1YK8x56iI2CppP2BU0s8b7DvI19kx12zK4YFUDSe9b0vp9eaIa3vuuG6TtCtZoPlaRHw7JZfuOgEi4mHgh2Tt97MlVX4E5sv7xLWk7c8ma07t52s8CjhO0r1kj/k4mqymU6ZrBCAitqb3bWQ/HI6gpJ/XTjnYlMM6oNJzZQVwRS59eer9sgh4JFXn1wOLJc1JPWQWp7S+kNrpLwDujIjP5jaV5jolzU01GiTtCbyB7N7UD4AT0m7V11i59hOAayO7q7wOOCn15DoIWAjc2J2raCwizoiIeRGxgOyG/7UR8TZKdI0Akp4laWZlmexzdgcl+rxOi173UPCrvRfwDeB+4A9kv4ROJWvXvgbYnN73TvuK7ImldwO3AyO58/wXshut48Apvb6uqmt8FVnzwW3ALel1bJmuE/gPwM3pGu8APprSn0/2RToOfBPYPaXvkdbH0/bn5871kXTtdwFLe31tda73tTzZG61U15iu59b02gR8JKWX5vM6HS9PV2NmZoVzM5qZmRXOwcbMzArnYGNmZoVzsDEzs8I52JiZWeEcbMymgaT9JX1d0i/TlCXXSfrLGvstUG7G7lz6WZLe0EI+L5MU/TjzsVkjDjZmU5QGoX4H+HFEPD8iDicbxDivar+600NFxEcj4vstZHcy8JP0XrMskvx3bX3HH0qzqTsaeCwivlBJiIj7IuL/SHqHpG9K+ifge/VOIOkiSSdIWippbS79tenYSlA7AXgH2UjzPVL6AmXP/vk8cBMwX9LiVLu6KeW/V9r3o5I2SLpD0mo1mHDObDo52JhN3UvIvuTreQWwIiKObuFco8CiNO0JwFuBS9PyUcA9EXE32Vxqx+aOO5hs2vqXAb8F/gfwhog4DBgje6YMwOci4uUR8e+APYE3tVAmsylzsDGbZpLOU/YEzg0paTQiqp9BVFNkU+t/F3hzanZ7I0/OqXUy2YSWpPd8U9p9kT0bBbIJPQ8BfpoeYbACeF7a9jplT8G8naxG9pL2r9CsfX7EgNnUbQL+U2UlIt4taV+yGgVkNY12XAq8m2zG4w0RsVPSjJTHcZI+Qja/1j6VCSCr8hBZgHvKfZ3U7PZ5srm4tkg6k2w+MrPCuWZjNnXXAntIOi2X9swpnO+HZI/+fhdPNqG9Abg1IuZHxIKIeB7ZIxiOr3H89cBRkl4IIOmZkl7Ek4HlwXQP54Qax5oVwsHGbIoim832eOA1ku6RdCPZY4A/XOeQgyVN5F4nVp3vceBKYGl6h6zJ7PKq83wL+Osa5Zkk60TwDUm3kQWfP4/suTlfIptp+DvAhupjzYriWZ/NzKxwrtmYmVnhHGzMzKxwDjZmZlY4BxszMyucg42ZmRXOwcbMzArnYGNmZoX7//KqxPBztvolAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "plt.scatter(train.GrLivArea, train.SalePrice, c = \"blue\", marker = \"s\")\n",
    "plt.title(\"Looking for outliers\")\n",
    "plt.xlabel(\"GrLivArea\")\n",
    "plt.ylabel(\"SalePrice\")\n",
    "plt.show()\n",
    "\n",
    "#离群点检测，剔除噪声数据。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 1456 entries, 0 to 1459\n",
      "Data columns (total 80 columns):\n",
      "MSSubClass       1456 non-null int64\n",
      "MSZoning         1456 non-null object\n",
      "LotFrontage      1197 non-null float64\n",
      "LotArea          1456 non-null int64\n",
      "Street           1456 non-null object\n",
      "Alley            91 non-null object\n",
      "LotShape         1456 non-null object\n",
      "LandContour      1456 non-null object\n",
      "Utilities        1456 non-null object\n",
      "LotConfig        1456 non-null object\n",
      "LandSlope        1456 non-null object\n",
      "Neighborhood     1456 non-null object\n",
      "Condition1       1456 non-null object\n",
      "Condition2       1456 non-null object\n",
      "BldgType         1456 non-null object\n",
      "HouseStyle       1456 non-null object\n",
      "OverallQual      1456 non-null int64\n",
      "OverallCond      1456 non-null int64\n",
      "YearBuilt        1456 non-null int64\n",
      "YearRemodAdd     1456 non-null int64\n",
      "RoofStyle        1456 non-null object\n",
      "RoofMatl         1456 non-null object\n",
      "Exterior1st      1456 non-null object\n",
      "Exterior2nd      1456 non-null object\n",
      "MasVnrType       1448 non-null object\n",
      "MasVnrArea       1448 non-null float64\n",
      "ExterQual        1456 non-null object\n",
      "ExterCond        1456 non-null object\n",
      "Foundation       1456 non-null object\n",
      "BsmtQual         1419 non-null object\n",
      "BsmtCond         1419 non-null object\n",
      "BsmtExposure     1418 non-null object\n",
      "BsmtFinType1     1419 non-null object\n",
      "BsmtFinSF1       1456 non-null int64\n",
      "BsmtFinType2     1418 non-null object\n",
      "BsmtFinSF2       1456 non-null int64\n",
      "BsmtUnfSF        1456 non-null int64\n",
      "TotalBsmtSF      1456 non-null int64\n",
      "Heating          1456 non-null object\n",
      "HeatingQC        1456 non-null object\n",
      "CentralAir       1456 non-null object\n",
      "Electrical       1455 non-null object\n",
      "1stFlrSF         1456 non-null int64\n",
      "2ndFlrSF         1456 non-null int64\n",
      "LowQualFinSF     1456 non-null int64\n",
      "GrLivArea        1456 non-null int64\n",
      "BsmtFullBath     1456 non-null int64\n",
      "BsmtHalfBath     1456 non-null int64\n",
      "FullBath         1456 non-null int64\n",
      "HalfBath         1456 non-null int64\n",
      "BedroomAbvGr     1456 non-null int64\n",
      "KitchenAbvGr     1456 non-null int64\n",
      "KitchenQual      1456 non-null object\n",
      "TotRmsAbvGrd     1456 non-null int64\n",
      "Functional       1456 non-null object\n",
      "Fireplaces       1456 non-null int64\n",
      "FireplaceQu      766 non-null object\n",
      "GarageType       1375 non-null object\n",
      "GarageYrBlt      1375 non-null float64\n",
      "GarageFinish     1375 non-null object\n",
      "GarageCars       1456 non-null int64\n",
      "GarageArea       1456 non-null int64\n",
      "GarageQual       1375 non-null object\n",
      "GarageCond       1375 non-null object\n",
      "PavedDrive       1456 non-null object\n",
      "WoodDeckSF       1456 non-null int64\n",
      "OpenPorchSF      1456 non-null int64\n",
      "EnclosedPorch    1456 non-null int64\n",
      "3SsnPorch        1456 non-null int64\n",
      "ScreenPorch      1456 non-null int64\n",
      "PoolArea         1456 non-null int64\n",
      "PoolQC           5 non-null object\n",
      "Fence            280 non-null object\n",
      "MiscFeature      54 non-null object\n",
      "MiscVal          1456 non-null int64\n",
      "MoSold           1456 non-null int64\n",
      "YrSold           1456 non-null int64\n",
      "SaleType         1456 non-null object\n",
      "SaleCondition    1456 non-null object\n",
      "SalePrice        1456 non-null int64\n",
      "dtypes: float64(3), int64(34), object(43)\n",
      "memory usage: 921.4+ KB\n"
     ]
    }
   ],
   "source": [
    "train = train[train.GrLivArea < 4000]\n",
    "temp = train.reindex()\n",
    "train.info()\n",
    "#发现有4个噪声点明显偏离，可以对噪声点进行删除。然后再看他的基本信息。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_missvalue_by_meaning (df):\n",
    "    # Alley : data description says NA means \"no alley access\"\n",
    "    df.loc[:, \"Alley\"] = df.loc[:, \"Alley\"].fillna(\"None\")\n",
    "\n",
    "    # BedroomAbvGr : NA most likely means 0\n",
    "    df.loc[:, \"BedroomAbvGr\"] = df.loc[:, \"BedroomAbvGr\"].fillna(0)\n",
    "\n",
    "    # BsmtQual etc : data description says NA for basement features is \"no basement\"\n",
    "    df.loc[:, \"BsmtQual\"] = df.loc[:, \"BsmtQual\"].fillna(\"No\")\n",
    "    df.loc[:, \"BsmtCond\"] = df.loc[:, \"BsmtCond\"].fillna(\"No\")\n",
    "    df.loc[:, \"BsmtExposure\"] = df.loc[:, \"BsmtExposure\"].fillna(\"No\")\n",
    "    df.loc[:, \"BsmtFinType1\"] = df.loc[:, \"BsmtFinType1\"].fillna(\"No\")\n",
    "    df.loc[:, \"BsmtFinType2\"] = df.loc[:, \"BsmtFinType2\"].fillna(\"No\")\n",
    "    df.loc[:, \"BsmtFullBath\"] = df.loc[:, \"BsmtFullBath\"].fillna(0)\n",
    "    df.loc[:, \"BsmtHalfBath\"] = df.loc[:, \"BsmtHalfBath\"].fillna(0)\n",
    "    df.loc[:, \"BsmtUnfSF\"] = df.loc[:, \"BsmtUnfSF\"].fillna(0)\n",
    "\n",
    "    # CentralAir : NA most likely means No\n",
    "    df.loc[:, \"CentralAir\"] = df.loc[:, \"CentralAir\"].fillna(\"N\")\n",
    "\n",
    "    # Condition : NA most likely means Normal，靠近主干道或铁路\n",
    "    df.loc[:, \"Condition1\"] = df.loc[:, \"Condition1\"].fillna(\"Norm\")\n",
    "    df.loc[:, \"Condition2\"] = df.loc[:, \"Condition2\"].fillna(\"Norm\")\n",
    "\n",
    "    # EnclosedPorch : NA most likely means no enclosed porch\n",
    "    df.loc[:, \"EnclosedPorch\"] = df.loc[:, \"EnclosedPorch\"].fillna(0)\n",
    "\n",
    "    # External stuff : NA most likely means average\n",
    "    df.loc[:, \"ExterCond\"] = df.loc[:, \"ExterCond\"].fillna(\"TA\")\n",
    "    df.loc[:, \"ExterQual\"] = df.loc[:, \"ExterQual\"].fillna(\"TA\")\n",
    "\n",
    "    # Fence : data description says NA means \"no fence\"\n",
    "    df.loc[:, \"Fence\"] = df.loc[:, \"Fence\"].fillna(\"No\")\n",
    "\n",
    "    # FireplaceQu : data description says NA means \"no fireplace\"\n",
    "    df.loc[:, \"FireplaceQu\"] = df.loc[:, \"FireplaceQu\"].fillna(\"No\")\n",
    "    df.loc[:, \"Fireplaces\"] = df.loc[:, \"Fireplaces\"].fillna(0)\n",
    "\n",
    "    # Functional : data description says NA means typical，家用（Home）功能性评级\n",
    "    df.loc[:, \"Functional\"] = df.loc[:, \"Functional\"].fillna(\"Typ\")\n",
    "\n",
    "    # GarageType etc : data description says NA for garage features is \"no garage\"\n",
    "    df.loc[:, \"GarageType\"] = df.loc[:, \"GarageType\"].fillna(\"No\")\n",
    "    df.loc[:, \"GarageFinish\"] = df.loc[:, \"GarageFinish\"].fillna(\"No\")\n",
    "    df.loc[:, \"GarageQual\"] = df.loc[:, \"GarageQual\"].fillna(\"No\")\n",
    "    df.loc[:, \"GarageCond\"] = df.loc[:, \"GarageCond\"].fillna(\"No\")\n",
    "    df.loc[:, \"GarageArea\"] = df.loc[:, \"GarageArea\"].fillna(0)\n",
    "    df.loc[:, \"GarageCars\"] = df.loc[:, \"GarageCars\"].fillna(0)\n",
    "\n",
    "    # HalfBath : NA most likely means no half baths above grade\n",
    "    df.loc[:, \"HalfBath\"] = df.loc[:, \"HalfBath\"].fillna(0)\n",
    "\n",
    "    # HeatingQC : NA most likely means typical\n",
    "    df.loc[:, \"HeatingQC\"] = df.loc[:, \"HeatingQC\"].fillna(\"TA\")\n",
    "\n",
    "    # KitchenAbvGr : NA most likely means 0\n",
    "    df.loc[:, \"KitchenAbvGr\"] = df.loc[:, \"KitchenAbvGr\"].fillna(0)\n",
    "\n",
    "    # KitchenQual : NA most likely means typical\n",
    "    df.loc[:, \"KitchenQual\"] = df.loc[:, \"KitchenQual\"].fillna(\"TA\")\n",
    "\n",
    "    # LotFrontage : NA most likely means no lot frontage\n",
    "    df.loc[:, \"LotFrontage\"] = df.loc[:, \"LotFrontage\"].fillna(0)\n",
    "\n",
    "    # LotShape : NA most likely means regular\n",
    "    df.loc[:, \"LotShape\"] = df.loc[:, \"LotShape\"].fillna(\"Reg\")\n",
    "\n",
    "    # MasVnrType : NA most likely means no veneer，表层砌体（Masonry veneer）类型\n",
    "    df.loc[:, \"MasVnrType\"] = df.loc[:, \"MasVnrType\"].fillna(\"None\")\n",
    "    df.loc[:, \"MasVnrArea\"] = df.loc[:, \"MasVnrArea\"].fillna(0)\n",
    "\n",
    "    # MiscFeature : data description says NA means \"no misc feature\"\n",
    "    df.loc[:, \"MiscFeature\"] = df.loc[:, \"MiscFeature\"].fillna(\"No\")\n",
    "    df.loc[:, \"MiscVal\"] = df.loc[:, \"MiscVal\"].fillna(0)\n",
    "\n",
    "    # OpenPorchSF : NA most likely means no open porch\n",
    "    df.loc[:, \"OpenPorchSF\"] = df.loc[:, \"OpenPorchSF\"].fillna(0)\n",
    "\n",
    "    # PavedDrive : NA most likely means not paved\n",
    "    df.loc[:, \"PavedDrive\"] = df.loc[:, \"PavedDrive\"].fillna(\"N\")\n",
    "\n",
    "    # PoolQC : data description says NA means \"no pool\"\n",
    "    df.loc[:, \"PoolQC\"] = df.loc[:, \"PoolQC\"].fillna(\"No\")\n",
    "    df.loc[:, \"PoolArea\"] = df.loc[:, \"PoolArea\"].fillna(0)\n",
    "\n",
    "    # SaleCondition : NA most likely means normal sale\n",
    "    df.loc[:, \"SaleCondition\"] = df.loc[:, \"SaleCondition\"].fillna(\"Normal\")\n",
    "\n",
    "    # ScreenPorch : NA most likely means no screen porch，观景门廊\n",
    "    df.loc[:, \"ScreenPorch\"] = df.loc[:, \"ScreenPorch\"].fillna(0)\n",
    "\n",
    "    # TotRmsAbvGrd : NA most likely means 0\n",
    "    df.loc[:, \"TotRmsAbvGrd\"] = df.loc[:, \"TotRmsAbvGrd\"].fillna(0)\n",
    "\n",
    "    # Utilities : NA most likely means all public utilities\n",
    "    df.loc[:, \"Utilities\"] = df.loc[:, \"Utilities\"].fillna(\"AllPub\")\n",
    "\n",
    "    # WoodDeckSF : NA most likely means no wood deck\n",
    "    df.loc[:, \"WoodDeckSF\"] = df.loc[:, \"WoodDeckSF\"].fillna(0)\n",
    "    \n",
    "    return df\n",
    "# 有些特征，用median/mean or most common value 填充没有意义\n",
    "# 因为特征工程对训练集和测试都需要进行，因此我们定义成函数，将数据集以参数形式传递\n",
    "train = process_missvalue_by_meaning(train)\n",
    "test = process_missvalue_by_meaning(test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 1456 entries, 0 to 1459\n",
      "Data columns (total 80 columns):\n",
      "MSSubClass       1456 non-null int64\n",
      "MSZoning         1456 non-null object\n",
      "LotFrontage      1456 non-null float64\n",
      "LotArea          1456 non-null int64\n",
      "Street           1456 non-null object\n",
      "Alley            1456 non-null object\n",
      "LotShape         1456 non-null object\n",
      "LandContour      1456 non-null object\n",
      "Utilities        1456 non-null object\n",
      "LotConfig        1456 non-null object\n",
      "LandSlope        1456 non-null object\n",
      "Neighborhood     1456 non-null object\n",
      "Condition1       1456 non-null object\n",
      "Condition2       1456 non-null object\n",
      "BldgType         1456 non-null object\n",
      "HouseStyle       1456 non-null object\n",
      "OverallQual      1456 non-null int64\n",
      "OverallCond      1456 non-null int64\n",
      "YearBuilt        1456 non-null int64\n",
      "YearRemodAdd     1456 non-null int64\n",
      "RoofStyle        1456 non-null object\n",
      "RoofMatl         1456 non-null object\n",
      "Exterior1st      1456 non-null object\n",
      "Exterior2nd      1456 non-null object\n",
      "MasVnrType       1456 non-null object\n",
      "MasVnrArea       1456 non-null float64\n",
      "ExterQual        1456 non-null object\n",
      "ExterCond        1456 non-null object\n",
      "Foundation       1456 non-null object\n",
      "BsmtQual         1456 non-null object\n",
      "BsmtCond         1456 non-null object\n",
      "BsmtExposure     1456 non-null object\n",
      "BsmtFinType1     1456 non-null object\n",
      "BsmtFinSF1       1456 non-null int64\n",
      "BsmtFinType2     1456 non-null object\n",
      "BsmtFinSF2       1456 non-null int64\n",
      "BsmtUnfSF        1456 non-null int64\n",
      "TotalBsmtSF      1456 non-null int64\n",
      "Heating          1456 non-null object\n",
      "HeatingQC        1456 non-null object\n",
      "CentralAir       1456 non-null object\n",
      "Electrical       1455 non-null object\n",
      "1stFlrSF         1456 non-null int64\n",
      "2ndFlrSF         1456 non-null int64\n",
      "LowQualFinSF     1456 non-null int64\n",
      "GrLivArea        1456 non-null int64\n",
      "BsmtFullBath     1456 non-null int64\n",
      "BsmtHalfBath     1456 non-null int64\n",
      "FullBath         1456 non-null int64\n",
      "HalfBath         1456 non-null int64\n",
      "BedroomAbvGr     1456 non-null int64\n",
      "KitchenAbvGr     1456 non-null int64\n",
      "KitchenQual      1456 non-null object\n",
      "TotRmsAbvGrd     1456 non-null int64\n",
      "Functional       1456 non-null object\n",
      "Fireplaces       1456 non-null int64\n",
      "FireplaceQu      1456 non-null object\n",
      "GarageType       1456 non-null object\n",
      "GarageYrBlt      1375 non-null float64\n",
      "GarageFinish     1456 non-null object\n",
      "GarageCars       1456 non-null int64\n",
      "GarageArea       1456 non-null int64\n",
      "GarageQual       1456 non-null object\n",
      "GarageCond       1456 non-null object\n",
      "PavedDrive       1456 non-null object\n",
      "WoodDeckSF       1456 non-null int64\n",
      "OpenPorchSF      1456 non-null int64\n",
      "EnclosedPorch    1456 non-null int64\n",
      "3SsnPorch        1456 non-null int64\n",
      "ScreenPorch      1456 non-null int64\n",
      "PoolArea         1456 non-null int64\n",
      "PoolQC           1456 non-null object\n",
      "Fence            1456 non-null object\n",
      "MiscFeature      1456 non-null object\n",
      "MiscVal          1456 non-null int64\n",
      "MoSold           1456 non-null int64\n",
      "YrSold           1456 non-null int64\n",
      "SaleType         1456 non-null object\n",
      "SaleCondition    1456 non-null object\n",
      "SalePrice        1456 non-null int64\n",
      "dtypes: float64(3), int64(34), object(43)\n",
      "memory usage: 921.4+ KB\n"
     ]
    }
   ],
   "source": [
    "train.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def numberical2cat(df):\n",
    "    df.replace({\"MSSubClass\" : {20 : \"SC20\", 30 : \"SC30\", 40 : \"SC40\", 45 : \"SC45\", \n",
    "                                       50 : \"SC50\", 60 : \"SC60\", 70 : \"SC70\", 75 : \"SC75\", \n",
    "                                       80 : \"SC80\", 85 : \"SC85\", 90 : \"SC90\", 120 : \"SC120\", \n",
    "                                       150 : \"SC150\", 160 : \"SC160\", 180 : \"SC180\", 190 : \"SC190\"},\n",
    "                       \"MoSold\" : {1 : \"Jan\", 2 : \"Feb\", 3 : \"Mar\", 4 : \"Apr\", 5 : \"May\", 6 : \"Jun\",\n",
    "                                   7 : \"Jul\", 8 : \"Aug\", 9 : \"Sep\", 10 : \"Oct\", 11 : \"Nov\", 12 : \"Dec\"}\n",
    "                      }, inplace = True)\n",
    "\n",
    "    return df\n",
    "train = numberical2cat(train)\n",
    "test = numberical2cat(test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "def cat2numberical(df):\n",
    "    df.replace({\"Alley\" : {\"None\":0, \"Grvl\" : 1, \"Pave\" : 2},\n",
    "                \"BsmtCond\" : {\"No\" : 0, \"Po\" : 1, \"Fa\" : 2, \"TA\" : 3, \"Gd\" : 4, \"Ex\" : 5},\n",
    "                \"BsmtExposure\" : {\"No\" : 0, \"Mn\" : 1, \"Av\": 2, \"Gd\" : 3},\n",
    "                \"BsmtFinType1\" : {\"No\" : 0, \"Unf\" : 1, \"LwQ\": 2, \"Rec\" : 3, \"BLQ\" : 4, \n",
    "                                         \"ALQ\" : 5, \"GLQ\" : 6},\n",
    "                \"BsmtFinType2\" : {\"No\" : 0, \"Unf\" : 1, \"LwQ\": 2, \"Rec\" : 3, \"BLQ\" : 4, \n",
    "                                         \"ALQ\" : 5, \"GLQ\" : 6},\n",
    "                \"BsmtQual\" : {\"No\" : 0, \"Po\" : 1, \"Fa\" : 2, \"TA\": 3, \"Gd\" : 4, \"Ex\" : 5},\n",
    "                \"ExterCond\" : {\"Po\" : 1, \"Fa\" : 2, \"TA\": 3, \"Gd\": 4, \"Ex\" : 5},\n",
    "                \"ExterQual\" : {\"Po\" : 1, \"Fa\" : 2, \"TA\": 3, \"Gd\": 4, \"Ex\" : 5},\n",
    "                \"FireplaceQu\" : {\"No\" : 0, \"Po\" : 1, \"Fa\" : 2, \"TA\" : 3, \"Gd\" : 4, \"Ex\" : 5},\n",
    "                \"Functional\" : {\"Sal\" : 1, \"Sev\" : 2, \"Maj2\" : 3, \"Maj1\" : 4, \"Mod\": 5, \n",
    "                                       \"Min2\" : 6, \"Min1\" : 7, \"Typ\" : 8},\n",
    "                \"GarageCond\" : {\"No\" : 0, \"Po\" : 1, \"Fa\" : 2, \"TA\" : 3, \"Gd\" : 4, \"Ex\" : 5},\n",
    "                \"GarageQual\" : {\"No\" : 0, \"Po\" : 1, \"Fa\" : 2, \"TA\" : 3, \"Gd\" : 4, \"Ex\" : 5},\n",
    "                \"HeatingQC\" : {\"Po\" : 1, \"Fa\" : 2, \"TA\" : 3, \"Gd\" : 4, \"Ex\" : 5},\n",
    "                \"KitchenQual\" : {\"Po\" : 1, \"Fa\" : 2, \"TA\" : 3, \"Gd\" : 4, \"Ex\" : 5},\n",
    "                \"LandSlope\" : {\"Sev\" : 1, \"Mod\" : 2, \"Gtl\" : 3},\n",
    "                \"LotShape\" : {\"IR3\" : 1, \"IR2\" : 2, \"IR1\" : 3, \"Reg\" : 4},\n",
    "                \"PavedDrive\" : {\"N\" : 0, \"P\" : 1, \"Y\" : 2},\n",
    "                \"PoolQC\" : {\"No\" : 0, \"Fa\" : 1, \"TA\" : 2, \"Gd\" : 3, \"Ex\" : 4},\n",
    "                \"Street\" : {\"Grvl\" : 1, \"Pave\" : 2},\n",
    "                \"Utilities\" : {\"ELO\" : 1, \"NoSeWa\" : 2, \"NoSewr\" : 3, \"AllPub\" : 4}},\n",
    "                       inplace = True\n",
    "                     )\n",
    "    return df\n",
    "\n",
    "train = cat2numberical(train)\n",
    "test = cat2numberical(test)\n",
    "#当顺序中有信息时，将某些分类特征编码为序号。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Numerical features : 56\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 1456 entries, 0 to 1459\n",
      "Data columns (total 80 columns):\n",
      "MSSubClass       1456 non-null int64\n",
      "MSZoning         1456 non-null object\n",
      "LotFrontage      1456 non-null float64\n",
      "LotArea          1456 non-null int64\n",
      "Street           1456 non-null int64\n",
      "Alley            1456 non-null int64\n",
      "LotShape         1456 non-null int64\n",
      "LandContour      1456 non-null object\n",
      "Utilities        1456 non-null int64\n",
      "LotConfig        1456 non-null object\n",
      "LandSlope        1456 non-null int64\n",
      "Neighborhood     1456 non-null object\n",
      "Condition1       1456 non-null object\n",
      "Condition2       1456 non-null object\n",
      "BldgType         1456 non-null object\n",
      "HouseStyle       1456 non-null object\n",
      "OverallQual      1456 non-null int64\n",
      "OverallCond      1456 non-null int64\n",
      "YearBuilt        1456 non-null int64\n",
      "YearRemodAdd     1456 non-null int64\n",
      "RoofStyle        1456 non-null object\n",
      "RoofMatl         1456 non-null object\n",
      "Exterior1st      1456 non-null object\n",
      "Exterior2nd      1456 non-null object\n",
      "MasVnrType       1456 non-null object\n",
      "MasVnrArea       1456 non-null float64\n",
      "ExterQual        1456 non-null int64\n",
      "ExterCond        1456 non-null int64\n",
      "Foundation       1456 non-null object\n",
      "BsmtQual         1456 non-null int64\n",
      "BsmtCond         1456 non-null int64\n",
      "BsmtExposure     1456 non-null int64\n",
      "BsmtFinType1     1456 non-null int64\n",
      "BsmtFinSF1       1456 non-null int64\n",
      "BsmtFinType2     1456 non-null int64\n",
      "BsmtFinSF2       1456 non-null int64\n",
      "BsmtUnfSF        1456 non-null int64\n",
      "TotalBsmtSF      1456 non-null int64\n",
      "Heating          1456 non-null object\n",
      "HeatingQC        1456 non-null int64\n",
      "CentralAir       1456 non-null object\n",
      "Electrical       1455 non-null object\n",
      "1stFlrSF         1456 non-null int64\n",
      "2ndFlrSF         1456 non-null int64\n",
      "LowQualFinSF     1456 non-null int64\n",
      "GrLivArea        1456 non-null int64\n",
      "BsmtFullBath     1456 non-null int64\n",
      "BsmtHalfBath     1456 non-null int64\n",
      "FullBath         1456 non-null int64\n",
      "HalfBath         1456 non-null int64\n",
      "BedroomAbvGr     1456 non-null int64\n",
      "KitchenAbvGr     1456 non-null int64\n",
      "KitchenQual      1456 non-null int64\n",
      "TotRmsAbvGrd     1456 non-null int64\n",
      "Functional       1456 non-null int64\n",
      "Fireplaces       1456 non-null int64\n",
      "FireplaceQu      1456 non-null int64\n",
      "GarageType       1456 non-null object\n",
      "GarageYrBlt      1375 non-null float64\n",
      "GarageFinish     1456 non-null object\n",
      "GarageCars       1456 non-null int64\n",
      "GarageArea       1456 non-null int64\n",
      "GarageQual       1456 non-null int64\n",
      "GarageCond       1456 non-null int64\n",
      "PavedDrive       1456 non-null int64\n",
      "WoodDeckSF       1456 non-null int64\n",
      "OpenPorchSF      1456 non-null int64\n",
      "EnclosedPorch    1456 non-null int64\n",
      "3SsnPorch        1456 non-null int64\n",
      "ScreenPorch      1456 non-null int64\n",
      "PoolArea         1456 non-null int64\n",
      "PoolQC           1456 non-null int64\n",
      "Fence            1456 non-null object\n",
      "MiscFeature      1456 non-null object\n",
      "MiscVal          1456 non-null int64\n",
      "MoSold           1456 non-null int64\n",
      "YrSold           1456 non-null int64\n",
      "SaleType         1456 non-null object\n",
      "SaleCondition    1456 non-null object\n",
      "SalePrice        1456 non-null int64\n",
      "dtypes: float64(3), int64(54), object(23)\n",
      "memory usage: 921.4+ KB\n",
      "NAs for numerical features in df : 81\n",
      "Remaining NAs for numerical features in df : 0\n"
     ]
    }
   ],
   "source": [
    "from sklearn.preprocessing import StandardScaler\n",
    "def fillna_numerical_train(df):\n",
    "    numerical_features = df.select_dtypes(exclude = [\"object\"]).columns\n",
    "    \n",
    "    numerical_features = numerical_features.drop(\"SalePrice\")\n",
    "    print(\"Numerical features : \" + str(len(numerical_features)))\n",
    "\n",
    "    df.info()\n",
    "    df_num = df[numerical_features]\n",
    "    #df_num.info()\n",
    "    \n",
    "    medians = df_num.median() \n",
    "    # Handle remaining missing values for numerical features by using median as replacement\n",
    "    print(\"NAs for numerical features in df : \" + str(df_num.isnull().values.sum()))\n",
    "    df_num = df_num.fillna(medians)\n",
    "    print(\"Remaining NAs for numerical features in df : \" + str(df_num.isnull().values.sum()))\n",
    "\n",
    "    #df_num.info()\n",
    "    # 分别初始化对特征和目标值的标准化器\n",
    "    ss_X = StandardScaler()\n",
    "\n",
    "    # 对训练特征进行标准化处理\n",
    "    temp = ss_X.fit_transform(df_num)\n",
    "    df_num = pd.DataFrame(data=temp, columns=numerical_features, index =df_num.index)\n",
    "    \n",
    "    return df_num, medians, ss_X\n",
    "\n",
    "train_num, medians, ss_X = fillna_numerical_train(train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Numerical features : 56\n",
      "NAs for numerical features in df : 81\n",
      "Remaining NAs for numerical features in df : 0\n"
     ]
    }
   ],
   "source": [
    "def fillna_numerical_test(df, medians, ss_X):\n",
    "    numerical_features = df.select_dtypes(exclude = [\"object\"]).columns\n",
    "    #测试集中没有SalePrice，这个是需要进行预测的y值\n",
    "    print(\"Numerical features : \" + str(len(numerical_features)))\n",
    "\n",
    "    df_num = df[numerical_features]\n",
    "    \n",
    "    #用中位数作替换处理数值特征的剩余缺失值\n",
    "    print(\"NAs for numerical features in df : \" + str(df_num.isnull().values.sum()))\n",
    "    df_num = df_num.fillna(medians)\n",
    "    print(\"Remaining NAs for numerical features in df : \" + str(df_num.isnull().values.sum()))\n",
    "\n",
    "    #对数值特征进行标准化\n",
    "    temp = ss_X.transform(df_num)\n",
    "    df_num = pd.DataFrame(data=temp, columns=numerical_features, index =df_num.index )\n",
    "    return df_num\n",
    "\n",
    "test_num = fillna_numerical_test(test, medians, ss_X)\n",
    "#对测试集的其他数值型特征进行空缺值填补（用训练集中相对应的列的中值进行填补）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\AI\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:19: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
      "of pandas will change to not sort by default.\n",
      "\n",
      "To accept the future behavior, pass 'sort=True'.\n",
      "\n",
      "To retain the current behavior and silence the warning, pass sort=False\n",
      "\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Categorical features : 23\n",
      "NAs for categorical features in df : 8\n",
      "Remaining NAs for categorical features in df : 0\n"
     ]
    }
   ],
   "source": [
    "def get_dummies_cat(df):\n",
    "    categorical_features = df.select_dtypes(include = [\"object\"]).columns\n",
    "    print(\"Categorical features : \" + str(len(categorical_features)))\n",
    "    df_cat = df[categorical_features]\n",
    "    \n",
    "\n",
    "    # Create dummy features for categorical values via one-hot encoding\n",
    "    print(\"NAs for categorical features in df : \" + str(df_cat.isnull().values.sum()))\n",
    "    df_cat = pd.get_dummies(df_cat,dummy_na=True)\n",
    "    print(\"Remaining NAs for categorical features in df : \" + str(df_cat.isnull().values.sum()))\n",
    "    \n",
    "    return df_cat\n",
    "\n",
    "#必须考虑类别型特征的取值范围（训练集和测试的取值范围可能不同）\n",
    "#train_cat = get_dummies_cat(train)\n",
    "#test_cat = get_dummies_cat(test)\n",
    "\n",
    "n_train_samples = train.shape[0]  \n",
    "train_test = pd.concat((train, test), axis=0)\n",
    "train_test_cat = get_dummies_cat(train_test)\n",
    "   \n",
    "train_cat = train_test_cat.iloc[:n_train_samples, :]\n",
    "test_cat = train_test_cat.iloc[n_train_samples:, :]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 1456 entries, 0 to 1459\n",
      "Columns: 195 entries, BldgType_1Fam to SaleType_nan\n",
      "dtypes: uint8(195)\n",
      "memory usage: 288.6 KB\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>BldgType_1Fam</th>\n",
       "      <th>BldgType_2fmCon</th>\n",
       "      <th>BldgType_Duplex</th>\n",
       "      <th>BldgType_Twnhs</th>\n",
       "      <th>BldgType_TwnhsE</th>\n",
       "      <th>BldgType_nan</th>\n",
       "      <th>CentralAir_N</th>\n",
       "      <th>CentralAir_Y</th>\n",
       "      <th>CentralAir_nan</th>\n",
       "      <th>Condition1_Artery</th>\n",
       "      <th>...</th>\n",
       "      <th>SaleType_COD</th>\n",
       "      <th>SaleType_CWD</th>\n",
       "      <th>SaleType_Con</th>\n",
       "      <th>SaleType_ConLD</th>\n",
       "      <th>SaleType_ConLI</th>\n",
       "      <th>SaleType_ConLw</th>\n",
       "      <th>SaleType_New</th>\n",
       "      <th>SaleType_Oth</th>\n",
       "      <th>SaleType_WD</th>\n",
       "      <th>SaleType_nan</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 195 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   BldgType_1Fam  BldgType_2fmCon  BldgType_Duplex  BldgType_Twnhs  \\\n",
       "0              1                0                0               0   \n",
       "1              1                0                0               0   \n",
       "2              1                0                0               0   \n",
       "3              1                0                0               0   \n",
       "4              1                0                0               0   \n",
       "\n",
       "   BldgType_TwnhsE  BldgType_nan  CentralAir_N  CentralAir_Y  CentralAir_nan  \\\n",
       "0                0             0             0             1               0   \n",
       "1                0             0             0             1               0   \n",
       "2                0             0             0             1               0   \n",
       "3                0             0             0             1               0   \n",
       "4                0             0             0             1               0   \n",
       "\n",
       "   Condition1_Artery      ...       SaleType_COD  SaleType_CWD  SaleType_Con  \\\n",
       "0                  0      ...                  0             0             0   \n",
       "1                  0      ...                  0             0             0   \n",
       "2                  0      ...                  0             0             0   \n",
       "3                  0      ...                  0             0             0   \n",
       "4                  0      ...                  0             0             0   \n",
       "\n",
       "   SaleType_ConLD  SaleType_ConLI  SaleType_ConLw  SaleType_New  SaleType_Oth  \\\n",
       "0               0               0               0             0             0   \n",
       "1               0               0               0             0             0   \n",
       "2               0               0               0             0             0   \n",
       "3               0               0               0             0             0   \n",
       "4               0               0               0             0             0   \n",
       "\n",
       "   SaleType_WD  SaleType_nan  \n",
       "0            1             0  \n",
       "1            1             0  \n",
       "2            1             0  \n",
       "3            1             0  \n",
       "4            1             0  \n",
       "\n",
       "[5 rows x 195 columns]"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_cat.info()\n",
    "train_cat.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "New number of features : 251\n",
      "New number of features : 251\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 1456 entries, 0 to 1459\n",
      "Columns: 252 entries, 0 to SalePrice\n",
      "dtypes: float64(56), int64(1), uint8(195)\n",
      "memory usage: 937.0 KB\n"
     ]
    }
   ],
   "source": [
    "def joint_num_cat(df_num, df_cat):\n",
    "    df = pd.concat([df_num, df_cat], axis = 1, ignore_index=True)\n",
    "    print(\"New number of features : \" + str(df.shape[1]))\n",
    "    \n",
    "    return df\n",
    "\n",
    "FE_train = joint_num_cat(train_num, train_cat)\n",
    "FE_test = joint_num_cat(test_num, test_cat)\n",
    "\n",
    "FE_train = pd.concat([FE_train, train['SalePrice']], axis = 1)\n",
    "FE_test = pd.concat([test_id,FE_test], axis = 1)\n",
    "\n",
    "FE_train.to_csv('AmesHouse_FE_train.csv', index=False)\n",
    "FE_test.to_csv('AmesHouse_FE_test.csv', index=False)\n",
    "FE_train.info()\n",
    "#把处理过的训练以及测试数据都导出。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
